# numpy and pandas for data manipulation
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import dataprep.eda as eda
import qgrid
import plotly.express as px
from plotly.offline import init_notebook_mode
init_notebook_mode(connected = True)
import joblib
# RGB triple (70, 130, 180) scaled to the [0, 1] range; presumably a plot
# colour -- its usage is outside this excerpt.
bleu = (70 / 255, 130 / 255, 180 / 255)
# Display limits for notebook output. The full option names are spelled out:
# abbreviated keys are resolved by pattern matching and break as soon as a
# second option matches the same pattern.
pd.set_option('display.max_rows', 219)
pd.set_option('display.max_columns', 243)
pd.set_option('display.max_colwidth', 50)
# sklearn preprocessing for dealing with categorical variables
from sklearn.preprocessing import LabelEncoder
# File system management
import os
# Suppress warnings
import warnings
warnings.filterwarnings('ignore')
# matplotlib and seaborn for plotting
import matplotlib.pyplot as plt
import seaborn as sns
%load_ext autoreload
%autoreload 1
%aimport Functions
from Functions import numerical_summary, categorical_summary, one_hot_encoding_dataframe, correlated_features, frame_vs_target, multi_violin,select_numerical
Loading functions ...
.... done.
Nombre total de fichiers mis à disposition : 10 (dont le fichier de description des colonnes)
Fichier principal avec la cible
6 autres fichiers contenant des informations sur chaque crédit
2 fichiers utiles pour la submission Kaggle (inutiles ici)
# List the files available in the data directory (path is relative to the
# notebook's working directory)
print(os.listdir("../donnees/"))
['application_test.csv', 'application_train.csv', 'bureau.csv', 'bureau_balance.csv', 'credit_card_balance.csv', 'HomeCredit_columns_description.csv', 'installments_payments.csv', 'POS_CASH_balance.csv', 'previous_application.csv', 'sample_submission.csv']
# Column descriptions (data dictionary) covering the columns of every table.
# index_col=0 uses the first CSV column as the row index rather than as data.
description = pd.read_csv('../donnees/HomeCredit_columns_description.csv',index_col=0)
print('HomeCredit_columns_description shape: ', description.shape)
# Bare last expression: the notebook renders the frame as the cell output
description
HomeCredit_columns_description shape: (219, 4)
| Table | Row | Description | Special | |
|---|---|---|---|---|
| 1 | application_{train|test}.csv | SK_ID_CURR | ID of loan in our sample | NaN |
| 2 | application_{train|test}.csv | TARGET | Target variable (1 - client with payment diffi... | NaN |
| 5 | application_{train|test}.csv | NAME_CONTRACT_TYPE | Identification if loan is cash or revolving | NaN |
| 6 | application_{train|test}.csv | CODE_GENDER | Gender of the client | NaN |
| 7 | application_{train|test}.csv | FLAG_OWN_CAR | Flag if the client owns a car | NaN |
| 8 | application_{train|test}.csv | FLAG_OWN_REALTY | Flag if client owns a house or flat | NaN |
| 9 | application_{train|test}.csv | CNT_CHILDREN | Number of children the client has | NaN |
| 10 | application_{train|test}.csv | AMT_INCOME_TOTAL | Income of the client | NaN |
| 11 | application_{train|test}.csv | AMT_CREDIT | Credit amount of the loan | NaN |
| 12 | application_{train|test}.csv | AMT_ANNUITY | Loan annuity | NaN |
| 13 | application_{train|test}.csv | AMT_GOODS_PRICE | For consumer loans it is the price of the good... | NaN |
| 14 | application_{train|test}.csv | NAME_TYPE_SUITE | Who was accompanying client when he was applyi... | NaN |
| 15 | application_{train|test}.csv | NAME_INCOME_TYPE | Clients income type (businessman, working, mat... | NaN |
| 16 | application_{train|test}.csv | NAME_EDUCATION_TYPE | Level of highest education the client achieved | NaN |
| 17 | application_{train|test}.csv | NAME_FAMILY_STATUS | Family status of the client | NaN |
| 18 | application_{train|test}.csv | NAME_HOUSING_TYPE | What is the housing situation of the client (r... | NaN |
| 19 | application_{train|test}.csv | REGION_POPULATION_RELATIVE | Normalized population of region where client l... | normalized |
| 20 | application_{train|test}.csv | DAYS_BIRTH | Client's age in days at the time of application | time only relative to the application |
| 21 | application_{train|test}.csv | DAYS_EMPLOYED | How many days before the application the perso... | time only relative to the application |
| 22 | application_{train|test}.csv | DAYS_REGISTRATION | How many days before the application did clien... | time only relative to the application |
| 23 | application_{train|test}.csv | DAYS_ID_PUBLISH | How many days before the application did clien... | time only relative to the application |
| 24 | application_{train|test}.csv | OWN_CAR_AGE | Age of client's car | NaN |
| 25 | application_{train|test}.csv | FLAG_MOBIL | Did client provide mobile phone (1=YES, 0=NO) | NaN |
| 26 | application_{train|test}.csv | FLAG_EMP_PHONE | Did client provide work phone (1=YES, 0=NO) | NaN |
| 27 | application_{train|test}.csv | FLAG_WORK_PHONE | Did client provide home phone (1=YES, 0=NO) | NaN |
| 28 | application_{train|test}.csv | FLAG_CONT_MOBILE | Was mobile phone reachable (1=YES, 0=NO) | NaN |
| 29 | application_{train|test}.csv | FLAG_PHONE | Did client provide home phone (1=YES, 0=NO) | NaN |
| 30 | application_{train|test}.csv | FLAG_EMAIL | Did client provide email (1=YES, 0=NO) | NaN |
| 31 | application_{train|test}.csv | OCCUPATION_TYPE | What kind of occupation does the client have | NaN |
| 32 | application_{train|test}.csv | CNT_FAM_MEMBERS | How many family members does client have | NaN |
| 33 | application_{train|test}.csv | REGION_RATING_CLIENT | Our rating of the region where client lives (1... | NaN |
| 34 | application_{train|test}.csv | REGION_RATING_CLIENT_W_CITY | Our rating of the region where client lives wi... | NaN |
| 35 | application_{train|test}.csv | WEEKDAY_APPR_PROCESS_START | On which day of the week did the client apply ... | NaN |
| 36 | application_{train|test}.csv | HOUR_APPR_PROCESS_START | Approximately at what hour did the client appl... | rounded |
| 37 | application_{train|test}.csv | REG_REGION_NOT_LIVE_REGION | Flag if client's permanent address does not ma... | NaN |
| 38 | application_{train|test}.csv | REG_REGION_NOT_WORK_REGION | Flag if client's permanent address does not ma... | NaN |
| 39 | application_{train|test}.csv | LIVE_REGION_NOT_WORK_REGION | Flag if client's contact address does not matc... | NaN |
| 40 | application_{train|test}.csv | REG_CITY_NOT_LIVE_CITY | Flag if client's permanent address does not ma... | NaN |
| 41 | application_{train|test}.csv | REG_CITY_NOT_WORK_CITY | Flag if client's permanent address does not ma... | NaN |
| 42 | application_{train|test}.csv | LIVE_CITY_NOT_WORK_CITY | Flag if client's contact address does not matc... | NaN |
| 43 | application_{train|test}.csv | ORGANIZATION_TYPE | Type of organization where client works | NaN |
| 44 | application_{train|test}.csv | EXT_SOURCE_1 | Normalized score from external data source | normalized |
| 45 | application_{train|test}.csv | EXT_SOURCE_2 | Normalized score from external data source | normalized |
| 46 | application_{train|test}.csv | EXT_SOURCE_3 | Normalized score from external data source | normalized |
| 47 | application_{train|test}.csv | APARTMENTS_AVG | Normalized information about building where th... | normalized |
| 48 | application_{train|test}.csv | BASEMENTAREA_AVG | Normalized information about building where th... | normalized |
| 49 | application_{train|test}.csv | YEARS_BEGINEXPLUATATION_AVG | Normalized information about building where th... | normalized |
| 50 | application_{train|test}.csv | YEARS_BUILD_AVG | Normalized information about building where th... | normalized |
| 51 | application_{train|test}.csv | COMMONAREA_AVG | Normalized information about building where th... | normalized |
| 52 | application_{train|test}.csv | ELEVATORS_AVG | Normalized information about building where th... | normalized |
| 53 | application_{train|test}.csv | ENTRANCES_AVG | Normalized information about building where th... | normalized |
| 54 | application_{train|test}.csv | FLOORSMAX_AVG | Normalized information about building where th... | normalized |
| 55 | application_{train|test}.csv | FLOORSMIN_AVG | Normalized information about building where th... | normalized |
| 56 | application_{train|test}.csv | LANDAREA_AVG | Normalized information about building where th... | normalized |
| 57 | application_{train|test}.csv | LIVINGAPARTMENTS_AVG | Normalized information about building where th... | normalized |
| 58 | application_{train|test}.csv | LIVINGAREA_AVG | Normalized information about building where th... | normalized |
| 59 | application_{train|test}.csv | NONLIVINGAPARTMENTS_AVG | Normalized information about building where th... | normalized |
| 60 | application_{train|test}.csv | NONLIVINGAREA_AVG | Normalized information about building where th... | normalized |
| 61 | application_{train|test}.csv | APARTMENTS_MODE | Normalized information about building where th... | normalized |
| 62 | application_{train|test}.csv | BASEMENTAREA_MODE | Normalized information about building where th... | normalized |
| 63 | application_{train|test}.csv | YEARS_BEGINEXPLUATATION_MODE | Normalized information about building where th... | normalized |
| 64 | application_{train|test}.csv | YEARS_BUILD_MODE | Normalized information about building where th... | normalized |
| 65 | application_{train|test}.csv | COMMONAREA_MODE | Normalized information about building where th... | normalized |
| 66 | application_{train|test}.csv | ELEVATORS_MODE | Normalized information about building where th... | normalized |
| 67 | application_{train|test}.csv | ENTRANCES_MODE | Normalized information about building where th... | normalized |
| 68 | application_{train|test}.csv | FLOORSMAX_MODE | Normalized information about building where th... | normalized |
| 69 | application_{train|test}.csv | FLOORSMIN_MODE | Normalized information about building where th... | normalized |
| 70 | application_{train|test}.csv | LANDAREA_MODE | Normalized information about building where th... | normalized |
| 71 | application_{train|test}.csv | LIVINGAPARTMENTS_MODE | Normalized information about building where th... | normalized |
| 72 | application_{train|test}.csv | LIVINGAREA_MODE | Normalized information about building where th... | normalized |
| 73 | application_{train|test}.csv | NONLIVINGAPARTMENTS_MODE | Normalized information about building where th... | normalized |
| 74 | application_{train|test}.csv | NONLIVINGAREA_MODE | Normalized information about building where th... | normalized |
| 75 | application_{train|test}.csv | APARTMENTS_MEDI | Normalized information about building where th... | normalized |
| 76 | application_{train|test}.csv | BASEMENTAREA_MEDI | Normalized information about building where th... | normalized |
| 77 | application_{train|test}.csv | YEARS_BEGINEXPLUATATION_MEDI | Normalized information about building where th... | normalized |
| 78 | application_{train|test}.csv | YEARS_BUILD_MEDI | Normalized information about building where th... | normalized |
| 79 | application_{train|test}.csv | COMMONAREA_MEDI | Normalized information about building where th... | normalized |
| 80 | application_{train|test}.csv | ELEVATORS_MEDI | Normalized information about building where th... | normalized |
| 81 | application_{train|test}.csv | ENTRANCES_MEDI | Normalized information about building where th... | normalized |
| 82 | application_{train|test}.csv | FLOORSMAX_MEDI | Normalized information about building where th... | normalized |
| 83 | application_{train|test}.csv | FLOORSMIN_MEDI | Normalized information about building where th... | normalized |
| 84 | application_{train|test}.csv | LANDAREA_MEDI | Normalized information about building where th... | normalized |
| 85 | application_{train|test}.csv | LIVINGAPARTMENTS_MEDI | Normalized information about building where th... | normalized |
| 86 | application_{train|test}.csv | LIVINGAREA_MEDI | Normalized information about building where th... | normalized |
| 87 | application_{train|test}.csv | NONLIVINGAPARTMENTS_MEDI | Normalized information about building where th... | normalized |
| 88 | application_{train|test}.csv | NONLIVINGAREA_MEDI | Normalized information about building where th... | normalized |
| 89 | application_{train|test}.csv | FONDKAPREMONT_MODE | Normalized information about building where th... | normalized |
| 90 | application_{train|test}.csv | HOUSETYPE_MODE | Normalized information about building where th... | normalized |
| 91 | application_{train|test}.csv | TOTALAREA_MODE | Normalized information about building where th... | normalized |
| 92 | application_{train|test}.csv | WALLSMATERIAL_MODE | Normalized information about building where th... | normalized |
| 93 | application_{train|test}.csv | EMERGENCYSTATE_MODE | Normalized information about building where th... | normalized |
| 94 | application_{train|test}.csv | OBS_30_CNT_SOCIAL_CIRCLE | How many observation of client's social surrou... | NaN |
| 95 | application_{train|test}.csv | DEF_30_CNT_SOCIAL_CIRCLE | How many observation of client's social surrou... | NaN |
| 96 | application_{train|test}.csv | OBS_60_CNT_SOCIAL_CIRCLE | How many observation of client's social surrou... | NaN |
| 97 | application_{train|test}.csv | DEF_60_CNT_SOCIAL_CIRCLE | How many observation of client's social surrou... | NaN |
| 98 | application_{train|test}.csv | DAYS_LAST_PHONE_CHANGE | How many days before application did client ch... | NaN |
| 99 | application_{train|test}.csv | FLAG_DOCUMENT_2 | Did client provide document 2 | NaN |
| 100 | application_{train|test}.csv | FLAG_DOCUMENT_3 | Did client provide document 3 | NaN |
| 101 | application_{train|test}.csv | FLAG_DOCUMENT_4 | Did client provide document 4 | NaN |
| 102 | application_{train|test}.csv | FLAG_DOCUMENT_5 | Did client provide document 5 | NaN |
| 103 | application_{train|test}.csv | FLAG_DOCUMENT_6 | Did client provide document 6 | NaN |
| 104 | application_{train|test}.csv | FLAG_DOCUMENT_7 | Did client provide document 7 | NaN |
| 105 | application_{train|test}.csv | FLAG_DOCUMENT_8 | Did client provide document 8 | NaN |
| 106 | application_{train|test}.csv | FLAG_DOCUMENT_9 | Did client provide document 9 | NaN |
| 107 | application_{train|test}.csv | FLAG_DOCUMENT_10 | Did client provide document 10 | NaN |
| 108 | application_{train|test}.csv | FLAG_DOCUMENT_11 | Did client provide document 11 | NaN |
| 109 | application_{train|test}.csv | FLAG_DOCUMENT_12 | Did client provide document 12 | NaN |
| 110 | application_{train|test}.csv | FLAG_DOCUMENT_13 | Did client provide document 13 | NaN |
| 111 | application_{train|test}.csv | FLAG_DOCUMENT_14 | Did client provide document 14 | NaN |
| 112 | application_{train|test}.csv | FLAG_DOCUMENT_15 | Did client provide document 15 | NaN |
| 113 | application_{train|test}.csv | FLAG_DOCUMENT_16 | Did client provide document 16 | NaN |
| 114 | application_{train|test}.csv | FLAG_DOCUMENT_17 | Did client provide document 17 | NaN |
| 115 | application_{train|test}.csv | FLAG_DOCUMENT_18 | Did client provide document 18 | NaN |
| 116 | application_{train|test}.csv | FLAG_DOCUMENT_19 | Did client provide document 19 | NaN |
| 117 | application_{train|test}.csv | FLAG_DOCUMENT_20 | Did client provide document 20 | NaN |
| 118 | application_{train|test}.csv | FLAG_DOCUMENT_21 | Did client provide document 21 | NaN |
| 119 | application_{train|test}.csv | AMT_REQ_CREDIT_BUREAU_HOUR | Number of enquiries to Credit Bureau about the... | NaN |
| 120 | application_{train|test}.csv | AMT_REQ_CREDIT_BUREAU_DAY | Number of enquiries to Credit Bureau about the... | NaN |
| 121 | application_{train|test}.csv | AMT_REQ_CREDIT_BUREAU_WEEK | Number of enquiries to Credit Bureau about the... | NaN |
| 122 | application_{train|test}.csv | AMT_REQ_CREDIT_BUREAU_MON | Number of enquiries to Credit Bureau about the... | NaN |
| 123 | application_{train|test}.csv | AMT_REQ_CREDIT_BUREAU_QRT | Number of enquiries to Credit Bureau about the... | NaN |
| 124 | application_{train|test}.csv | AMT_REQ_CREDIT_BUREAU_YEAR | Number of enquiries to Credit Bureau about the... | NaN |
| 125 | bureau.csv | SK_ID_CURR | ID of loan in our sample - one loan in our sam... | hashed |
| 126 | bureau.csv | SK_BUREAU_ID | Recoded ID of previous Credit Bureau credit re... | hashed |
| 127 | bureau.csv | CREDIT_ACTIVE | Status of the Credit Bureau (CB) reported credits | NaN |
| 128 | bureau.csv | CREDIT_CURRENCY | Recoded currency of the Credit Bureau credit | recoded |
| 129 | bureau.csv | DAYS_CREDIT | How many days before current application did c... | time only relative to the application |
| 130 | bureau.csv | CREDIT_DAY_OVERDUE | Number of days past due on CB credit at the ti... | NaN |
| 131 | bureau.csv | DAYS_CREDIT_ENDDATE | Remaining duration of CB credit (in days) at t... | time only relative to the application |
| 132 | bureau.csv | DAYS_ENDDATE_FACT | Days since CB credit ended at the time of appl... | time only relative to the application |
| 133 | bureau.csv | AMT_CREDIT_MAX_OVERDUE | Maximal amount overdue on the Credit Bureau cr... | NaN |
| 134 | bureau.csv | CNT_CREDIT_PROLONG | How many times was the Credit Bureau credit pr... | NaN |
| 135 | bureau.csv | AMT_CREDIT_SUM | Current credit amount for the Credit Bureau cr... | NaN |
| 136 | bureau.csv | AMT_CREDIT_SUM_DEBT | Current debt on Credit Bureau credit | NaN |
| 137 | bureau.csv | AMT_CREDIT_SUM_LIMIT | Current credit limit of credit card reported i... | NaN |
| 138 | bureau.csv | AMT_CREDIT_SUM_OVERDUE | Current amount overdue on Credit Bureau credit | NaN |
| 139 | bureau.csv | CREDIT_TYPE | Type of Credit Bureau credit (Car, cash,...) | NaN |
| 140 | bureau.csv | DAYS_CREDIT_UPDATE | How many days before loan application did last... | time only relative to the application |
| 141 | bureau.csv | AMT_ANNUITY | Annuity of the Credit Bureau credit | NaN |
| 142 | bureau_balance.csv | SK_BUREAU_ID | Recoded ID of Credit Bureau credit (unique cod... | hashed |
| 143 | bureau_balance.csv | MONTHS_BALANCE | Month of balance relative to application date ... | time only relative to the application |
| 144 | bureau_balance.csv | STATUS | Status of Credit Bureau loan during the month ... | NaN |
| 145 | POS_CASH_balance.csv | SK_ID_PREV | ID of previous credit in Home Credit related t... | NaN |
| 146 | POS_CASH_balance.csv | SK_ID_CURR | ID of loan in our sample | NaN |
| 147 | POS_CASH_balance.csv | MONTHS_BALANCE | Month of balance relative to application date ... | time only relative to the application |
| 148 | POS_CASH_balance.csv | CNT_INSTALMENT | Term of previous credit (can change over time) | NaN |
| 149 | POS_CASH_balance.csv | CNT_INSTALMENT_FUTURE | Installments left to pay on the previous credit | NaN |
| 150 | POS_CASH_balance.csv | NAME_CONTRACT_STATUS | Contract status during the month | NaN |
| 151 | POS_CASH_balance.csv | SK_DPD | DPD (days past due) during the month of previo... | NaN |
| 152 | POS_CASH_balance.csv | SK_DPD_DEF | DPD during the month with tolerance (debts wit... | NaN |
| 153 | credit_card_balance.csv | SK_ID_PREV | ID of previous credit in Home credit related t... | hashed |
| 154 | credit_card_balance.csv | SK_ID_CURR | ID of loan in our sample | hashed |
| 155 | credit_card_balance.csv | MONTHS_BALANCE | Month of balance relative to application date ... | time only relative to the application |
| 156 | credit_card_balance.csv | AMT_BALANCE | Balance during the month of previous credit | NaN |
| 157 | credit_card_balance.csv | AMT_CREDIT_LIMIT_ACTUAL | Credit card limit during the month of the prev... | NaN |
| 158 | credit_card_balance.csv | AMT_DRAWINGS_ATM_CURRENT | Amount drawing at ATM during the month of the ... | NaN |
| 159 | credit_card_balance.csv | AMT_DRAWINGS_CURRENT | Amount drawing during the month of the previou... | NaN |
| 160 | credit_card_balance.csv | AMT_DRAWINGS_OTHER_CURRENT | Amount of other drawings during the month of t... | NaN |
| 161 | credit_card_balance.csv | AMT_DRAWINGS_POS_CURRENT | Amount drawing or buying goods during the mont... | NaN |
| 162 | credit_card_balance.csv | AMT_INST_MIN_REGULARITY | Minimal installment for this month of the prev... | NaN |
| 163 | credit_card_balance.csv | AMT_PAYMENT_CURRENT | How much did the client pay during the month o... | NaN |
| 164 | credit_card_balance.csv | AMT_PAYMENT_TOTAL_CURRENT | How much did the client pay during the month i... | NaN |
| 165 | credit_card_balance.csv | AMT_RECEIVABLE_PRINCIPAL | Amount receivable for principal on the previou... | NaN |
| 166 | credit_card_balance.csv | AMT_RECIVABLE | Amount receivable on the previous credit | NaN |
| 167 | credit_card_balance.csv | AMT_TOTAL_RECEIVABLE | Total amount receivable on the previous credit | NaN |
| 168 | credit_card_balance.csv | CNT_DRAWINGS_ATM_CURRENT | Number of drawings at ATM during this month on... | NaN |
| 169 | credit_card_balance.csv | CNT_DRAWINGS_CURRENT | Number of drawings during this month on the pr... | NaN |
| 170 | credit_card_balance.csv | CNT_DRAWINGS_OTHER_CURRENT | Number of other drawings during this month on ... | NaN |
| 171 | credit_card_balance.csv | CNT_DRAWINGS_POS_CURRENT | Number of drawings for goods during this month... | NaN |
| 172 | credit_card_balance.csv | CNT_INSTALMENT_MATURE_CUM | Number of paid installments on the previous cr... | NaN |
| 173 | credit_card_balance.csv | NAME_CONTRACT_STATUS | Contract status (active signed,...) on the pre... | NaN |
| 174 | credit_card_balance.csv | SK_DPD | DPD (Days past due) during the month on the pr... | NaN |
| 175 | credit_card_balance.csv | SK_DPD_DEF | DPD (Days past due) during the month with tole... | NaN |
| 176 | previous_application.csv | SK_ID_PREV | ID of previous credit in Home credit related t... | hashed |
| 177 | previous_application.csv | SK_ID_CURR | ID of loan in our sample | hashed |
| 178 | previous_application.csv | NAME_CONTRACT_TYPE | Contract product type (Cash loan, consumer loa... | NaN |
| 179 | previous_application.csv | AMT_ANNUITY | Annuity of previous application | NaN |
| 180 | previous_application.csv | AMT_APPLICATION | For how much credit did client ask on the prev... | NaN |
| 181 | previous_application.csv | AMT_CREDIT | Final credit amount on the previous applicatio... | NaN |
| 182 | previous_application.csv | AMT_DOWN_PAYMENT | Down payment on the previous application | NaN |
| 183 | previous_application.csv | AMT_GOODS_PRICE | Goods price of good that client asked for (if ... | NaN |
| 184 | previous_application.csv | WEEKDAY_APPR_PROCESS_START | On which day of the week did the client apply ... | NaN |
| 185 | previous_application.csv | HOUR_APPR_PROCESS_START | Approximately at what day hour did the client ... | rounded |
| 186 | previous_application.csv | FLAG_LAST_APPL_PER_CONTRACT | Flag if it was last application for the previo... | NaN |
| 187 | previous_application.csv | NFLAG_LAST_APPL_IN_DAY | Flag if the application was the last applicati... | NaN |
| 188 | previous_application.csv | NFLAG_MICRO_CASH | Flag Micro finance loan | NaN |
| 189 | previous_application.csv | RATE_DOWN_PAYMENT | Down payment rate normalized on previous credit | normalized |
| 190 | previous_application.csv | RATE_INTEREST_PRIMARY | Interest rate normalized on previous credit | normalized |
| 191 | previous_application.csv | RATE_INTEREST_PRIVILEGED | Interest rate normalized on previous credit | normalized |
| 192 | previous_application.csv | NAME_CASH_LOAN_PURPOSE | Purpose of the cash loan | NaN |
| 193 | previous_application.csv | NAME_CONTRACT_STATUS | Contract status (approved, cancelled, ...) of ... | NaN |
| 194 | previous_application.csv | DAYS_DECISION | Relative to current application when was the d... | time only relative to the application |
| 195 | previous_application.csv | NAME_PAYMENT_TYPE | Payment method that client chose to pay for th... | NaN |
| 196 | previous_application.csv | CODE_REJECT_REASON | Why was the previous application rejected | NaN |
| 197 | previous_application.csv | NAME_TYPE_SUITE | Who accompanied client when applying for the p... | NaN |
| 198 | previous_application.csv | NAME_CLIENT_TYPE | Was the client old or new client when applying... | NaN |
| 199 | previous_application.csv | NAME_GOODS_CATEGORY | What kind of goods did the client apply for in... | NaN |
| 200 | previous_application.csv | NAME_PORTFOLIO | Was the previous application for CASH, POS, CA... | NaN |
| 201 | previous_application.csv | NAME_PRODUCT_TYPE | Was the previous application x-sell o walk-in | NaN |
| 202 | previous_application.csv | CHANNEL_TYPE | Through which channel we acquired the client o... | NaN |
| 203 | previous_application.csv | SELLERPLACE_AREA | Selling area of seller place of the previous a... | NaN |
| 204 | previous_application.csv | NAME_SELLER_INDUSTRY | The industry of the seller | NaN |
| 205 | previous_application.csv | CNT_PAYMENT | Term of previous credit at application of the ... | NaN |
| 206 | previous_application.csv | NAME_YIELD_GROUP | Grouped interest rate into small medium and hi... | grouped |
| 207 | previous_application.csv | PRODUCT_COMBINATION | Detailed product combination of the previous a... | NaN |
| 208 | previous_application.csv | DAYS_FIRST_DRAWING | Relative to application date of current applic... | time only relative to the application |
| 209 | previous_application.csv | DAYS_FIRST_DUE | Relative to application date of current applic... | time only relative to the application |
| 210 | previous_application.csv | DAYS_LAST_DUE_1ST_VERSION | Relative to application date of current applic... | time only relative to the application |
| 211 | previous_application.csv | DAYS_LAST_DUE | Relative to application date of current applic... | time only relative to the application |
| 212 | previous_application.csv | DAYS_TERMINATION | Relative to application date of current applic... | time only relative to the application |
| 213 | previous_application.csv | NFLAG_INSURED_ON_APPROVAL | Did the client requested insurance during the ... | NaN |
| 214 | installments_payments.csv | SK_ID_PREV | ID of previous credit in Home credit related t... | hashed |
| 215 | installments_payments.csv | SK_ID_CURR | ID of loan in our sample | hashed |
| 216 | installments_payments.csv | NUM_INSTALMENT_VERSION | Version of installment calendar (0 is for cred... | NaN |
| 217 | installments_payments.csv | NUM_INSTALMENT_NUMBER | On which installment we observe payment | NaN |
| 218 | installments_payments.csv | DAYS_INSTALMENT | When the installment of previous credit was su... | time only relative to the application |
| 219 | installments_payments.csv | DAYS_ENTRY_PAYMENT | When was the installments of previous credit p... | time only relative to the application |
| 220 | installments_payments.csv | AMT_INSTALMENT | What was the prescribed installment amount of ... | NaN |
| 221 | installments_payments.csv | AMT_PAYMENT | What the client actually paid on previous cred... | NaN |
# Training data: the main application table, which holds the TARGET column.
# index_col=0 makes the first CSV column the row index (displayed as
# SK_ID_CURR in the cell output).
app_train = pd.read_csv('../donnees/application_train.csv',index_col=0)
print('Training data shape: ', app_train.shape)
# Preview the first rows as the cell output
app_train.head()
Training data shape: (307511, 121)
| TARGET | NAME_CONTRACT_TYPE | CODE_GENDER | FLAG_OWN_CAR | FLAG_OWN_REALTY | CNT_CHILDREN | AMT_INCOME_TOTAL | AMT_CREDIT | AMT_ANNUITY | AMT_GOODS_PRICE | NAME_TYPE_SUITE | NAME_INCOME_TYPE | NAME_EDUCATION_TYPE | NAME_FAMILY_STATUS | NAME_HOUSING_TYPE | REGION_POPULATION_RELATIVE | DAYS_BIRTH | DAYS_EMPLOYED | DAYS_REGISTRATION | DAYS_ID_PUBLISH | OWN_CAR_AGE | FLAG_MOBIL | FLAG_EMP_PHONE | FLAG_WORK_PHONE | FLAG_CONT_MOBILE | FLAG_PHONE | FLAG_EMAIL | OCCUPATION_TYPE | CNT_FAM_MEMBERS | REGION_RATING_CLIENT | REGION_RATING_CLIENT_W_CITY | WEEKDAY_APPR_PROCESS_START | HOUR_APPR_PROCESS_START | REG_REGION_NOT_LIVE_REGION | REG_REGION_NOT_WORK_REGION | LIVE_REGION_NOT_WORK_REGION | REG_CITY_NOT_LIVE_CITY | REG_CITY_NOT_WORK_CITY | LIVE_CITY_NOT_WORK_CITY | ORGANIZATION_TYPE | EXT_SOURCE_1 | EXT_SOURCE_2 | EXT_SOURCE_3 | APARTMENTS_AVG | BASEMENTAREA_AVG | YEARS_BEGINEXPLUATATION_AVG | YEARS_BUILD_AVG | COMMONAREA_AVG | ELEVATORS_AVG | ENTRANCES_AVG | FLOORSMAX_AVG | FLOORSMIN_AVG | LANDAREA_AVG | LIVINGAPARTMENTS_AVG | LIVINGAREA_AVG | NONLIVINGAPARTMENTS_AVG | NONLIVINGAREA_AVG | APARTMENTS_MODE | BASEMENTAREA_MODE | YEARS_BEGINEXPLUATATION_MODE | YEARS_BUILD_MODE | COMMONAREA_MODE | ELEVATORS_MODE | ENTRANCES_MODE | FLOORSMAX_MODE | FLOORSMIN_MODE | LANDAREA_MODE | LIVINGAPARTMENTS_MODE | LIVINGAREA_MODE | NONLIVINGAPARTMENTS_MODE | NONLIVINGAREA_MODE | APARTMENTS_MEDI | BASEMENTAREA_MEDI | YEARS_BEGINEXPLUATATION_MEDI | YEARS_BUILD_MEDI | COMMONAREA_MEDI | ELEVATORS_MEDI | ENTRANCES_MEDI | FLOORSMAX_MEDI | FLOORSMIN_MEDI | LANDAREA_MEDI | LIVINGAPARTMENTS_MEDI | LIVINGAREA_MEDI | NONLIVINGAPARTMENTS_MEDI | NONLIVINGAREA_MEDI | FONDKAPREMONT_MODE | HOUSETYPE_MODE | TOTALAREA_MODE | WALLSMATERIAL_MODE | EMERGENCYSTATE_MODE | OBS_30_CNT_SOCIAL_CIRCLE | DEF_30_CNT_SOCIAL_CIRCLE | OBS_60_CNT_SOCIAL_CIRCLE | DEF_60_CNT_SOCIAL_CIRCLE | DAYS_LAST_PHONE_CHANGE | FLAG_DOCUMENT_2 | FLAG_DOCUMENT_3 | FLAG_DOCUMENT_4 | FLAG_DOCUMENT_5 | FLAG_DOCUMENT_6 | FLAG_DOCUMENT_7 | 
FLAG_DOCUMENT_8 | FLAG_DOCUMENT_9 | FLAG_DOCUMENT_10 | FLAG_DOCUMENT_11 | FLAG_DOCUMENT_12 | FLAG_DOCUMENT_13 | FLAG_DOCUMENT_14 | FLAG_DOCUMENT_15 | FLAG_DOCUMENT_16 | FLAG_DOCUMENT_17 | FLAG_DOCUMENT_18 | FLAG_DOCUMENT_19 | FLAG_DOCUMENT_20 | FLAG_DOCUMENT_21 | AMT_REQ_CREDIT_BUREAU_HOUR | AMT_REQ_CREDIT_BUREAU_DAY | AMT_REQ_CREDIT_BUREAU_WEEK | AMT_REQ_CREDIT_BUREAU_MON | AMT_REQ_CREDIT_BUREAU_QRT | AMT_REQ_CREDIT_BUREAU_YEAR | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| SK_ID_CURR | |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 100002 | 1 | Cash loans | M | N | Y | 0 | 202500.0 | 406597.5 | 24700.5 | 351000.0 | Unaccompanied | Working | Secondary / secondary special | Single / not married | House / apartment | 0.018801 | -9461 | -637 | -3648.0 | -2120 | NaN | 1 | 1 | 0 | 1 | 1 | 0 | Laborers | 1.0 | 2 | 2 | WEDNESDAY | 10 | 0 | 0 | 0 | 0 | 0 | 0 | Business Entity Type 3 | 0.083037 | 0.262949 | 0.139376 | 0.0247 | 0.0369 | 0.9722 | 0.6192 | 0.0143 | 0.00 | 0.0690 | 0.0833 | 0.1250 | 0.0369 | 0.0202 | 0.0190 | 0.0000 | 0.0000 | 0.0252 | 0.0383 | 0.9722 | 0.6341 | 0.0144 | 0.0000 | 0.0690 | 0.0833 | 0.1250 | 0.0377 | 0.022 | 0.0198 | 0.0 | 0.0 | 0.0250 | 0.0369 | 0.9722 | 0.6243 | 0.0144 | 0.00 | 0.0690 | 0.0833 | 0.1250 | 0.0375 | 0.0205 | 0.0193 | 0.0000 | 0.00 | reg oper account | block of flats | 0.0149 | Stone, brick | No | 2.0 | 2.0 | 2.0 | 2.0 | -1134.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| 100003 | 0 | Cash loans | F | N | N | 0 | 270000.0 | 1293502.5 | 35698.5 | 1129500.0 | Family | State servant | Higher education | Married | House / apartment | 0.003541 | -16765 | -1188 | -1186.0 | -291 | NaN | 1 | 1 | 0 | 1 | 1 | 0 | Core staff | 2.0 | 1 | 1 | MONDAY | 11 | 0 | 0 | 0 | 0 | 0 | 0 | School | 0.311267 | 0.622246 | NaN | 0.0959 | 0.0529 | 0.9851 | 0.7960 | 0.0605 | 0.08 | 0.0345 | 0.2917 | 0.3333 | 0.0130 | 0.0773 | 0.0549 | 0.0039 | 0.0098 | 0.0924 | 0.0538 | 0.9851 | 0.8040 | 0.0497 | 0.0806 | 0.0345 | 0.2917 | 0.3333 | 0.0128 | 0.079 | 0.0554 | 0.0 | 0.0 | 0.0968 | 0.0529 | 0.9851 | 0.7987 | 0.0608 | 0.08 | 0.0345 | 0.2917 | 0.3333 | 0.0132 | 0.0787 | 0.0558 | 0.0039 | 0.01 | reg oper account | block of flats | 0.0714 | Block | No | 1.0 | 0.0 | 1.0 | 0.0 | -828.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 100004 | 0 | Revolving loans | M | Y | Y | 0 | 67500.0 | 135000.0 | 6750.0 | 135000.0 | Unaccompanied | Working | Secondary / secondary special | Single / not married | House / apartment | 0.010032 | -19046 | -225 | -4260.0 | -2531 | 26.0 | 1 | 1 | 1 | 1 | 1 | 0 | Laborers | 1.0 | 2 | 2 | MONDAY | 9 | 0 | 0 | 0 | 0 | 0 | 0 | Government | NaN | 0.555912 | 0.729567 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 | 0.0 | 0.0 | 0.0 | -815.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 100006 | 0 | Cash loans | F | N | Y | 0 | 135000.0 | 312682.5 | 29686.5 | 297000.0 | Unaccompanied | Working | Secondary / secondary special | Civil marriage | House / apartment | 0.008019 | -19005 | -3039 | -9833.0 | -2437 | NaN | 1 | 1 | 0 | 1 | 0 | 0 | Laborers | 2.0 | 2 | 2 | WEDNESDAY | 17 | 0 | 0 | 0 | 0 | 0 | 0 | Business Entity Type 3 | NaN | 0.650442 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2.0 | 0.0 | 2.0 | 0.0 | -617.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN |
| 100007 | 0 | Cash loans | M | N | Y | 0 | 121500.0 | 513000.0 | 21865.5 | 513000.0 | Unaccompanied | Working | Secondary / secondary special | Single / not married | House / apartment | 0.028663 | -19932 | -3038 | -4311.0 | -3458 | NaN | 1 | 1 | 0 | 1 | 0 | 0 | Core staff | 1.0 | 2 | 2 | THURSDAY | 11 | 0 | 0 | 0 | 0 | 1 | 1 | Religion | NaN | 0.322738 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 | 0.0 | 0.0 | 0.0 | -1106.0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
# Open the training frame in D-Tale, with the viewer bound to localhost.
import dtale
dtale.show(app_train, host='localhost')
307511 observations (crédits différents), 120 variables et TARGET (la prédiction quant au remboursement ou au défaut de remboursement du prêt).
The target is what we are asked to predict: either a 0 for the loan was repaid on time, or a 1 indicating the client had payment difficulties. We can first examine the number of loans falling into each category.
app_train['TARGET'].value_counts()
0 282686 1 24825 Name: TARGET, dtype: int64
# Class ratio: number of TARGET == 1 rows for each TARGET == 0 row.
target_counts = app_train.TARGET.value_counts()
class_ratio = target_counts[1] / target_counts[0]
print('TARGET ratio {:.8%}'.format(class_ratio))
TARGET ratio 8.78182860%
app_train['TARGET'].astype(int).plot.hist();
From this information, we see this is an imbalanced class problem. There are far more loans that were repaid on time than loans that were not repaid.
# Per-column summary of the numeric columns of app_train, built by the
# project helper numerical_summary (imported from Functions). The table it
# returns lists, for each column, its correlation with TARGET, the number of
# missing values and the matching percentage.
numerical = numerical_summary(app_train.select_dtypes('number'))
# Optional interactive display, currently disabled:
# qgrid.QgridWidget(df=numerical)
numerical
Data Frame a 105 colonnes. Dont 105 colonnes contiennent des valeurs manquantes.
| correlations | valeurs_manquantes | %_total | |
|---|---|---|---|
| EXT_SOURCE_3 | -0.18 | 60965 | 19.83 |
| EXT_SOURCE_2 | -0.16 | 660 | 0.21 |
| EXT_SOURCE_1 | -0.16 | 173378 | 56.38 |
| DAYS_EMPLOYED | -0.04 | 0 | 0.00 |
| FLOORSMAX_AVG | -0.04 | 153020 | 49.76 |
| FLOORSMAX_MEDI | -0.04 | 153020 | 49.76 |
| FLOORSMAX_MODE | -0.04 | 153020 | 49.76 |
| AMT_GOODS_PRICE | -0.04 | 278 | 0.09 |
| REGION_POPULATION_RELATIVE | -0.04 | 0 | 0.00 |
| ELEVATORS_AVG | -0.03 | 163891 | 53.30 |
| ELEVATORS_MEDI | -0.03 | 163891 | 53.30 |
| FLOORSMIN_AVG | -0.03 | 208642 | 67.85 |
| FLOORSMIN_MEDI | -0.03 | 208642 | 67.85 |
| LIVINGAREA_AVG | -0.03 | 154350 | 50.19 |
| LIVINGAREA_MEDI | -0.03 | 154350 | 50.19 |
| FLOORSMIN_MODE | -0.03 | 208642 | 67.85 |
| TOTALAREA_MODE | -0.03 | 148431 | 48.27 |
| ELEVATORS_MODE | -0.03 | 163891 | 53.30 |
| LIVINGAREA_MODE | -0.03 | 154350 | 50.19 |
| AMT_CREDIT | -0.03 | 0 | 0.00 |
| APARTMENTS_AVG | -0.03 | 156061 | 50.75 |
| APARTMENTS_MEDI | -0.03 | 156061 | 50.75 |
| FLAG_DOCUMENT_6 | -0.03 | 0 | 0.00 |
| APARTMENTS_MODE | -0.03 | 156061 | 50.75 |
| LIVINGAPARTMENTS_AVG | -0.03 | 210199 | 68.35 |
| LIVINGAPARTMENTS_MEDI | -0.02 | 210199 | 68.35 |
| HOUR_APPR_PROCESS_START | -0.02 | 0 | 0.00 |
| FLAG_PHONE | -0.02 | 0 | 0.00 |
| LIVINGAPARTMENTS_MODE | -0.02 | 210199 | 68.35 |
| BASEMENTAREA_AVG | -0.02 | 179943 | 58.52 |
| YEARS_BUILD_MEDI | -0.02 | 204488 | 66.50 |
| YEARS_BUILD_AVG | -0.02 | 204488 | 66.50 |
| BASEMENTAREA_MEDI | -0.02 | 179943 | 58.52 |
| YEARS_BUILD_MODE | -0.02 | 204488 | 66.50 |
| BASEMENTAREA_MODE | -0.02 | 179943 | 58.52 |
| ENTRANCES_AVG | -0.02 | 154828 | 50.35 |
| ENTRANCES_MEDI | -0.02 | 154828 | 50.35 |
| COMMONAREA_MEDI | -0.02 | 214865 | 69.87 |
| COMMONAREA_AVG | -0.02 | 214865 | 69.87 |
| ENTRANCES_MODE | -0.02 | 154828 | 50.35 |
| COMMONAREA_MODE | -0.02 | 214865 | 69.87 |
| NONLIVINGAREA_AVG | -0.01 | 169682 | 55.18 |
| NONLIVINGAREA_MEDI | -0.01 | 169682 | 55.18 |
| AMT_ANNUITY | -0.01 | 12 | 0.00 |
| NONLIVINGAREA_MODE | -0.01 | 169682 | 55.18 |
| AMT_REQ_CREDIT_BUREAU_MON | -0.01 | 41519 | 13.50 |
| FLAG_DOCUMENT_16 | -0.01 | 0 | 0.00 |
| FLAG_DOCUMENT_13 | -0.01 | 0 | 0.00 |
| LANDAREA_MEDI | -0.01 | 182590 | 59.38 |
| LANDAREA_AVG | -0.01 | 182590 | 59.38 |
| LANDAREA_MODE | -0.01 | 182590 | 59.38 |
| YEARS_BEGINEXPLUATATION_MEDI | -0.01 | 150007 | 48.78 |
| YEARS_BEGINEXPLUATATION_AVG | -0.01 | 150007 | 48.78 |
| FLAG_DOCUMENT_14 | -0.01 | 0 | 0.00 |
| YEARS_BEGINEXPLUATATION_MODE | -0.01 | 150007 | 48.78 |
| FLAG_DOCUMENT_8 | -0.01 | 0 | 0.00 |
| FLAG_DOCUMENT_18 | -0.01 | 0 | 0.00 |
| FLAG_DOCUMENT_15 | -0.01 | 0 | 0.00 |
| FLAG_DOCUMENT_9 | -0.00 | 0 | 0.00 |
| FLAG_DOCUMENT_11 | -0.00 | 0 | 0.00 |
| AMT_INCOME_TOTAL | -0.00 | 0 | 0.00 |
| FLAG_DOCUMENT_17 | -0.00 | 0 | 0.00 |
| NONLIVINGAPARTMENTS_AVG | -0.00 | 213514 | 69.43 |
| NONLIVINGAPARTMENTS_MEDI | -0.00 | 213514 | 69.43 |
| FLAG_DOCUMENT_4 | -0.00 | 0 | 0.00 |
| AMT_REQ_CREDIT_BUREAU_QRT | -0.00 | 41519 | 13.50 |
| FLAG_EMAIL | -0.00 | 0 | 0.00 |
| NONLIVINGAPARTMENTS_MODE | -0.00 | 213514 | 69.43 |
| FLAG_DOCUMENT_7 | -0.00 | 0 | 0.00 |
| FLAG_DOCUMENT_10 | -0.00 | 0 | 0.00 |
| FLAG_DOCUMENT_19 | -0.00 | 0 | 0.00 |
| FLAG_DOCUMENT_12 | -0.00 | 0 | 0.00 |
| FLAG_DOCUMENT_5 | -0.00 | 0 | 0.00 |
| FLAG_DOCUMENT_20 | 0.00 | 0 | 0.00 |
| FLAG_CONT_MOBILE | 0.00 | 0 | 0.00 |
| FLAG_MOBIL | 0.00 | 0 | 0.00 |
| AMT_REQ_CREDIT_BUREAU_WEEK | 0.00 | 41519 | 13.50 |
| AMT_REQ_CREDIT_BUREAU_HOUR | 0.00 | 41519 | 13.50 |
| AMT_REQ_CREDIT_BUREAU_DAY | 0.00 | 41519 | 13.50 |
| LIVE_REGION_NOT_WORK_REGION | 0.00 | 0 | 0.00 |
| FLAG_DOCUMENT_21 | 0.00 | 0 | 0.00 |
| FLAG_DOCUMENT_2 | 0.01 | 0 | 0.00 |
| REG_REGION_NOT_LIVE_REGION | 0.01 | 0 | 0.00 |
| REG_REGION_NOT_WORK_REGION | 0.01 | 0 | 0.00 |
| OBS_60_CNT_SOCIAL_CIRCLE | 0.01 | 1021 | 0.33 |
| OBS_30_CNT_SOCIAL_CIRCLE | 0.01 | 1021 | 0.33 |
| CNT_FAM_MEMBERS | 0.01 | 2 | 0.00 |
| CNT_CHILDREN | 0.02 | 0 | 0.00 |
| AMT_REQ_CREDIT_BUREAU_YEAR | 0.02 | 41519 | 13.50 |
| FLAG_WORK_PHONE | 0.03 | 0 | 0.00 |
| DEF_60_CNT_SOCIAL_CIRCLE | 0.03 | 1021 | 0.33 |
| DEF_30_CNT_SOCIAL_CIRCLE | 0.03 | 1021 | 0.33 |
| LIVE_CITY_NOT_WORK_CITY | 0.03 | 0 | 0.00 |
| OWN_CAR_AGE | 0.04 | 202929 | 65.99 |
| DAYS_REGISTRATION | 0.04 | 0 | 0.00 |
| FLAG_DOCUMENT_3 | 0.04 | 0 | 0.00 |
| REG_CITY_NOT_LIVE_CITY | 0.04 | 0 | 0.00 |
| FLAG_EMP_PHONE | 0.05 | 0 | 0.00 |
| REG_CITY_NOT_WORK_CITY | 0.05 | 0 | 0.00 |
| DAYS_ID_PUBLISH | 0.05 | 0 | 0.00 |
| DAYS_LAST_PHONE_CHANGE | 0.06 | 1 | 0.00 |
| REGION_RATING_CLIENT | 0.06 | 0 | 0.00 |
| REGION_RATING_CLIENT_W_CITY | 0.06 | 0 | 0.00 |
| DAYS_BIRTH | 0.08 | 0 | 0.00 |
| TARGET | 1.00 | 0 | 0.00 |
# Keep only the variables whose absolute correlation with TARGET exceeds 0.03.
# The comparison is strict: rows listed as +/-0.03 in the summary are dropped.
strong_mask = numerical['correlations'].abs() > 0.03
most_corr = numerical[strong_mask]
most_corr
| correlations | valeurs_manquantes | %_total | |
|---|---|---|---|
| EXT_SOURCE_3 | -0.18 | 60965 | 19.83 |
| EXT_SOURCE_2 | -0.16 | 660 | 0.21 |
| EXT_SOURCE_1 | -0.16 | 173378 | 56.38 |
| DAYS_EMPLOYED | -0.04 | 0 | 0.00 |
| FLOORSMAX_AVG | -0.04 | 153020 | 49.76 |
| FLOORSMAX_MEDI | -0.04 | 153020 | 49.76 |
| FLOORSMAX_MODE | -0.04 | 153020 | 49.76 |
| AMT_GOODS_PRICE | -0.04 | 278 | 0.09 |
| REGION_POPULATION_RELATIVE | -0.04 | 0 | 0.00 |
| OWN_CAR_AGE | 0.04 | 202929 | 65.99 |
| DAYS_REGISTRATION | 0.04 | 0 | 0.00 |
| FLAG_DOCUMENT_3 | 0.04 | 0 | 0.00 |
| REG_CITY_NOT_LIVE_CITY | 0.04 | 0 | 0.00 |
| FLAG_EMP_PHONE | 0.05 | 0 | 0.00 |
| REG_CITY_NOT_WORK_CITY | 0.05 | 0 | 0.00 |
| DAYS_ID_PUBLISH | 0.05 | 0 | 0.00 |
| DAYS_LAST_PHONE_CHANGE | 0.06 | 1 | 0.00 |
| REGION_RATING_CLIENT | 0.06 | 0 | 0.00 |
| REGION_RATING_CLIENT_W_CITY | 0.06 | 0 | 0.00 |
| DAYS_BIRTH | 0.08 | 0 | 0.00 |
| TARGET | 1.00 | 0 | 0.00 |
# Among the correlated variables, keep those with less than 60% missing values.
most_corr_less_miss = most_corr[most_corr['%_total'].lt(60)]
display(most_corr_less_miss)
# Names of the retained variables (TARGET is still part of the list).
most_corr_less_miss_cols = most_corr_less_miss.index.tolist()
most_corr_less_miss_cols
| correlations | valeurs_manquantes | %_total | |
|---|---|---|---|
| EXT_SOURCE_3 | -0.18 | 60965 | 19.83 |
| EXT_SOURCE_2 | -0.16 | 660 | 0.21 |
| EXT_SOURCE_1 | -0.16 | 173378 | 56.38 |
| DAYS_EMPLOYED | -0.04 | 0 | 0.00 |
| FLOORSMAX_AVG | -0.04 | 153020 | 49.76 |
| FLOORSMAX_MEDI | -0.04 | 153020 | 49.76 |
| FLOORSMAX_MODE | -0.04 | 153020 | 49.76 |
| AMT_GOODS_PRICE | -0.04 | 278 | 0.09 |
| REGION_POPULATION_RELATIVE | -0.04 | 0 | 0.00 |
| DAYS_REGISTRATION | 0.04 | 0 | 0.00 |
| FLAG_DOCUMENT_3 | 0.04 | 0 | 0.00 |
| REG_CITY_NOT_LIVE_CITY | 0.04 | 0 | 0.00 |
| FLAG_EMP_PHONE | 0.05 | 0 | 0.00 |
| REG_CITY_NOT_WORK_CITY | 0.05 | 0 | 0.00 |
| DAYS_ID_PUBLISH | 0.05 | 0 | 0.00 |
| DAYS_LAST_PHONE_CHANGE | 0.06 | 1 | 0.00 |
| REGION_RATING_CLIENT | 0.06 | 0 | 0.00 |
| REGION_RATING_CLIENT_W_CITY | 0.06 | 0 | 0.00 |
| DAYS_BIRTH | 0.08 | 0 | 0.00 |
| TARGET | 1.00 | 0 | 0.00 |
['EXT_SOURCE_3', 'EXT_SOURCE_2', 'EXT_SOURCE_1', 'DAYS_EMPLOYED', 'FLOORSMAX_AVG', 'FLOORSMAX_MEDI', 'FLOORSMAX_MODE', 'AMT_GOODS_PRICE', 'REGION_POPULATION_RELATIVE', 'DAYS_REGISTRATION', 'FLAG_DOCUMENT_3', 'REG_CITY_NOT_LIVE_CITY', 'FLAG_EMP_PHONE', 'REG_CITY_NOT_WORK_CITY', 'DAYS_ID_PUBLISH', 'DAYS_LAST_PHONE_CHANGE', 'REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_W_CITY', 'DAYS_BIRTH', 'TARGET']
app_train: quick view of distributions
# Distribution of every selected numerical variable, split by TARGET value.
selected = app_train[most_corr_less_miss_cols]
frame_vs_target(
    selected.drop(columns='TARGET'),
    app_train.TARGET,
    'All numerical variables selected from application_train by TARGET value ',
)
# Same variables as violin plots (the TARGET column is passed along).
multi_violin(selected)
# Work on a copy so that later modifications leave app_train untouched.
data = app_train.copy()
data.describe()
| TARGET | CNT_CHILDREN | AMT_INCOME_TOTAL | AMT_CREDIT | AMT_ANNUITY | AMT_GOODS_PRICE | REGION_POPULATION_RELATIVE | DAYS_BIRTH | DAYS_EMPLOYED | DAYS_REGISTRATION | DAYS_ID_PUBLISH | OWN_CAR_AGE | FLAG_MOBIL | FLAG_EMP_PHONE | FLAG_WORK_PHONE | FLAG_CONT_MOBILE | FLAG_PHONE | FLAG_EMAIL | CNT_FAM_MEMBERS | REGION_RATING_CLIENT | REGION_RATING_CLIENT_W_CITY | HOUR_APPR_PROCESS_START | REG_REGION_NOT_LIVE_REGION | REG_REGION_NOT_WORK_REGION | LIVE_REGION_NOT_WORK_REGION | REG_CITY_NOT_LIVE_CITY | REG_CITY_NOT_WORK_CITY | LIVE_CITY_NOT_WORK_CITY | EXT_SOURCE_1 | EXT_SOURCE_2 | EXT_SOURCE_3 | APARTMENTS_AVG | BASEMENTAREA_AVG | YEARS_BEGINEXPLUATATION_AVG | YEARS_BUILD_AVG | COMMONAREA_AVG | ELEVATORS_AVG | ENTRANCES_AVG | FLOORSMAX_AVG | FLOORSMIN_AVG | LANDAREA_AVG | LIVINGAPARTMENTS_AVG | LIVINGAREA_AVG | NONLIVINGAPARTMENTS_AVG | NONLIVINGAREA_AVG | APARTMENTS_MODE | BASEMENTAREA_MODE | YEARS_BEGINEXPLUATATION_MODE | YEARS_BUILD_MODE | COMMONAREA_MODE | ELEVATORS_MODE | ENTRANCES_MODE | FLOORSMAX_MODE | FLOORSMIN_MODE | LANDAREA_MODE | LIVINGAPARTMENTS_MODE | LIVINGAREA_MODE | NONLIVINGAPARTMENTS_MODE | NONLIVINGAREA_MODE | APARTMENTS_MEDI | BASEMENTAREA_MEDI | YEARS_BEGINEXPLUATATION_MEDI | YEARS_BUILD_MEDI | COMMONAREA_MEDI | ELEVATORS_MEDI | ENTRANCES_MEDI | FLOORSMAX_MEDI | FLOORSMIN_MEDI | LANDAREA_MEDI | LIVINGAPARTMENTS_MEDI | LIVINGAREA_MEDI | NONLIVINGAPARTMENTS_MEDI | NONLIVINGAREA_MEDI | TOTALAREA_MODE | OBS_30_CNT_SOCIAL_CIRCLE | DEF_30_CNT_SOCIAL_CIRCLE | OBS_60_CNT_SOCIAL_CIRCLE | DEF_60_CNT_SOCIAL_CIRCLE | DAYS_LAST_PHONE_CHANGE | FLAG_DOCUMENT_2 | FLAG_DOCUMENT_3 | FLAG_DOCUMENT_4 | FLAG_DOCUMENT_5 | FLAG_DOCUMENT_6 | FLAG_DOCUMENT_7 | FLAG_DOCUMENT_8 | FLAG_DOCUMENT_9 | FLAG_DOCUMENT_10 | FLAG_DOCUMENT_11 | FLAG_DOCUMENT_12 | FLAG_DOCUMENT_13 | FLAG_DOCUMENT_14 | FLAG_DOCUMENT_15 | FLAG_DOCUMENT_16 | FLAG_DOCUMENT_17 | FLAG_DOCUMENT_18 | FLAG_DOCUMENT_19 | FLAG_DOCUMENT_20 | FLAG_DOCUMENT_21 | AMT_REQ_CREDIT_BUREAU_HOUR | 
AMT_REQ_CREDIT_BUREAU_DAY | AMT_REQ_CREDIT_BUREAU_WEEK | AMT_REQ_CREDIT_BUREAU_MON | AMT_REQ_CREDIT_BUREAU_QRT | AMT_REQ_CREDIT_BUREAU_YEAR | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 307511.000000 | 307511.000000 | 3.075110e+05 | 3.075110e+05 | 307499.000000 | 3.072330e+05 | 307511.000000 | 307511.000000 | 307511.000000 | 307511.000000 | 307511.000000 | 104582.000000 | 307511.000000 | 307511.000000 | 307511.000000 | 307511.000000 | 307511.000000 | 307511.000000 | 307509.000000 | 307511.000000 | 307511.000000 | 307511.000000 | 307511.000000 | 307511.000000 | 307511.000000 | 307511.000000 | 307511.000000 | 307511.000000 | 134133.000000 | 3.068510e+05 | 246546.000000 | 151450.00000 | 127568.000000 | 157504.000000 | 103023.000000 | 92646.000000 | 143620.000000 | 152683.000000 | 154491.000000 | 98869.000000 | 124921.000000 | 97312.000000 | 153161.000000 | 93997.000000 | 137829.000000 | 151450.000000 | 127568.000000 | 157504.000000 | 103023.000000 | 92646.000000 | 143620.000000 | 152683.000000 | 154491.000000 | 98869.000000 | 124921.000000 | 97312.000000 | 153161.000000 | 93997.000000 | 137829.000000 | 151450.000000 | 127568.000000 | 157504.000000 | 103023.000000 | 92646.000000 | 143620.000000 | 152683.000000 | 154491.000000 | 98869.000000 | 124921.000000 | 97312.000000 | 153161.000000 | 93997.000000 | 137829.000000 | 159080.000000 | 306490.000000 | 306490.000000 | 306490.000000 | 306490.000000 | 307510.000000 | 307511.000000 | 307511.000000 | 307511.000000 | 307511.000000 | 307511.000000 | 307511.000000 | 307511.000000 | 307511.000000 | 307511.000000 | 307511.000000 | 307511.000000 | 307511.000000 | 307511.000000 | 307511.00000 | 307511.000000 | 307511.000000 | 307511.000000 | 307511.000000 | 307511.000000 | 307511.000000 | 265992.000000 | 265992.000000 | 265992.000000 | 265992.000000 | 265992.000000 | 265992.000000 |
| mean | 0.080729 | 0.417052 | 1.687979e+05 | 5.990260e+05 | 27108.573909 | 5.383962e+05 | 0.020868 | -16036.995067 | 63815.045904 | -4986.120328 | -2994.202373 | 12.061091 | 0.999997 | 0.819889 | 0.199368 | 0.998133 | 0.281066 | 0.056720 | 2.152665 | 2.052463 | 2.031521 | 12.063419 | 0.015144 | 0.050769 | 0.040659 | 0.078173 | 0.230454 | 0.179555 | 0.502130 | 5.143927e-01 | 0.510853 | 0.11744 | 0.088442 | 0.977735 | 0.752471 | 0.044621 | 0.078942 | 0.149725 | 0.226282 | 0.231894 | 0.066333 | 0.100775 | 0.107399 | 0.008809 | 0.028358 | 0.114231 | 0.087543 | 0.977065 | 0.759637 | 0.042553 | 0.074490 | 0.145193 | 0.222315 | 0.228058 | 0.064958 | 0.105645 | 0.105975 | 0.008076 | 0.027022 | 0.117850 | 0.087955 | 0.977752 | 0.755746 | 0.044595 | 0.078078 | 0.149213 | 0.225897 | 0.231625 | 0.067169 | 0.101954 | 0.108607 | 0.008651 | 0.028236 | 0.102547 | 1.422245 | 0.143421 | 1.405292 | 0.100049 | -962.858788 | 0.000042 | 0.710023 | 0.000081 | 0.015115 | 0.088055 | 0.000192 | 0.081376 | 0.003896 | 0.000023 | 0.003912 | 0.000007 | 0.003525 | 0.002936 | 0.00121 | 0.009928 | 0.000267 | 0.008130 | 0.000595 | 0.000507 | 0.000335 | 0.006402 | 0.007000 | 0.034362 | 0.267395 | 0.265474 | 1.899974 |
| std | 0.272419 | 0.722121 | 2.371231e+05 | 4.024908e+05 | 14493.737315 | 3.694465e+05 | 0.013831 | 4363.988632 | 141275.766519 | 3522.886321 | 1509.450419 | 11.944812 | 0.001803 | 0.384280 | 0.399526 | 0.043164 | 0.449521 | 0.231307 | 0.910682 | 0.509034 | 0.502737 | 3.265832 | 0.122126 | 0.219526 | 0.197499 | 0.268444 | 0.421124 | 0.383817 | 0.211062 | 1.910602e-01 | 0.194844 | 0.10824 | 0.082438 | 0.059223 | 0.113280 | 0.076036 | 0.134576 | 0.100049 | 0.144641 | 0.161380 | 0.081184 | 0.092576 | 0.110565 | 0.047732 | 0.069523 | 0.107936 | 0.084307 | 0.064575 | 0.110111 | 0.074445 | 0.132256 | 0.100977 | 0.143709 | 0.161160 | 0.081750 | 0.097880 | 0.111845 | 0.046276 | 0.070254 | 0.109076 | 0.082179 | 0.059897 | 0.112066 | 0.076144 | 0.134467 | 0.100368 | 0.145067 | 0.161934 | 0.082167 | 0.093642 | 0.112260 | 0.047415 | 0.070166 | 0.107462 | 2.400989 | 0.446698 | 2.379803 | 0.362291 | 826.808487 | 0.006502 | 0.453752 | 0.009016 | 0.122010 | 0.283376 | 0.013850 | 0.273412 | 0.062295 | 0.004771 | 0.062424 | 0.002550 | 0.059268 | 0.054110 | 0.03476 | 0.099144 | 0.016327 | 0.089798 | 0.024387 | 0.022518 | 0.018299 | 0.083849 | 0.110757 | 0.204685 | 0.916002 | 0.794056 | 1.869295 |
| min | 0.000000 | 0.000000 | 2.565000e+04 | 4.500000e+04 | 1615.500000 | 4.050000e+04 | 0.000290 | -25229.000000 | -17912.000000 | -24672.000000 | -7197.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.014568 | 8.173617e-08 | 0.000527 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | -4292.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 0.000000 | 0.000000 | 1.125000e+05 | 2.700000e+05 | 16524.000000 | 2.385000e+05 | 0.010006 | -19682.000000 | -2760.000000 | -7479.500000 | -4299.000000 | 5.000000 | 1.000000 | 1.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 2.000000 | 2.000000 | 2.000000 | 10.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.334007 | 3.924574e-01 | 0.370650 | 0.05770 | 0.044200 | 0.976700 | 0.687200 | 0.007800 | 0.000000 | 0.069000 | 0.166700 | 0.083300 | 0.018700 | 0.050400 | 0.045300 | 0.000000 | 0.000000 | 0.052500 | 0.040700 | 0.976700 | 0.699400 | 0.007200 | 0.000000 | 0.069000 | 0.166700 | 0.083300 | 0.016600 | 0.054200 | 0.042700 | 0.000000 | 0.000000 | 0.058300 | 0.043700 | 0.976700 | 0.691400 | 0.007900 | 0.000000 | 0.069000 | 0.166700 | 0.083300 | 0.018700 | 0.051300 | 0.045700 | 0.000000 | 0.000000 | 0.041200 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | -1570.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 50% | 0.000000 | 0.000000 | 1.471500e+05 | 5.135310e+05 | 24903.000000 | 4.500000e+05 | 0.018850 | -15750.000000 | -1213.000000 | -4504.000000 | -3254.000000 | 9.000000 | 1.000000 | 1.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 2.000000 | 2.000000 | 2.000000 | 12.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.505998 | 5.659614e-01 | 0.535276 | 0.08760 | 0.076300 | 0.981600 | 0.755200 | 0.021100 | 0.000000 | 0.137900 | 0.166700 | 0.208300 | 0.048100 | 0.075600 | 0.074500 | 0.000000 | 0.003600 | 0.084000 | 0.074600 | 0.981600 | 0.764800 | 0.019000 | 0.000000 | 0.137900 | 0.166700 | 0.208300 | 0.045800 | 0.077100 | 0.073100 | 0.000000 | 0.001100 | 0.086400 | 0.075800 | 0.981600 | 0.758500 | 0.020800 | 0.000000 | 0.137900 | 0.166700 | 0.208300 | 0.048700 | 0.076100 | 0.074900 | 0.000000 | 0.003100 | 0.068800 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | -757.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| 75% | 0.000000 | 1.000000 | 2.025000e+05 | 8.086500e+05 | 34596.000000 | 6.795000e+05 | 0.028663 | -12413.000000 | -289.000000 | -2010.000000 | -1720.000000 | 15.000000 | 1.000000 | 1.000000 | 0.000000 | 1.000000 | 1.000000 | 0.000000 | 3.000000 | 2.000000 | 2.000000 | 14.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.675053 | 6.636171e-01 | 0.669057 | 0.14850 | 0.112200 | 0.986600 | 0.823200 | 0.051500 | 0.120000 | 0.206900 | 0.333300 | 0.375000 | 0.085600 | 0.121000 | 0.129900 | 0.003900 | 0.027700 | 0.143900 | 0.112400 | 0.986600 | 0.823600 | 0.049000 | 0.120800 | 0.206900 | 0.333300 | 0.375000 | 0.084100 | 0.131300 | 0.125200 | 0.003900 | 0.023100 | 0.148900 | 0.111600 | 0.986600 | 0.825600 | 0.051300 | 0.120000 | 0.206900 | 0.333300 | 0.375000 | 0.086800 | 0.123100 | 0.130300 | 0.003900 | 0.026600 | 0.127600 | 2.000000 | 0.000000 | 2.000000 | 0.000000 | -274.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 3.000000 |
| max | 1.000000 | 19.000000 | 1.170000e+08 | 4.050000e+06 | 258025.500000 | 4.050000e+06 | 0.072508 | -7489.000000 | 365243.000000 | 0.000000 | 0.000000 | 91.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 20.000000 | 3.000000 | 3.000000 | 23.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.962693 | 8.549997e-01 | 0.896010 | 1.00000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 348.000000 | 34.000000 | 344.000000 | 24.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.00000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 4.000000 | 9.000000 | 8.000000 | 27.000000 | 261.000000 | 25.000000 |
Too high values:
Vars with outliers :
* DAYS_EMPLOYED
* AMT_GOODS_PRICE
* DAYS_REGISTRATION
* DAYS_LAST_PHONE_CHANGE
data['DAYS_EMPLOYED'].describe()
count 307511.000000 mean 63815.045904 std 141275.766519 min -17912.000000 25% -2760.000000 50% -1213.000000 75% -289.000000 max 365243.000000 Name: DAYS_EMPLOYED, dtype: float64
data['DAYS_EMPLOYED'].max()//365
1000
Le maximum, en plus d'être positif, représente presque 1000 ans
Y a-t-il une relation entre ces valeurs aberrantes et les défauts de paiement ?
C'est le cas
Solution :
# 365243 days is about 1000 years (and positive): treat it as an outlier and
# mark it as missing.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the column selection: that form is a chained
# assignment which, under pandas Copy-on-Write, acts on a temporary object and
# leaves `data` unchanged.
data['DAYS_EMPLOYED'] = data['DAYS_EMPLOYED'].replace({365243: np.nan})
col = 'DAYS_EMPLOYED'
plt.figure(figsize=(8, 6))
# One density curve per TARGET class: repaid loans (0) first, then defaults (1).
for target_value, curve_color in ((0, 'dodgerblue'), (1, 'red')):
    sns.kdeplot(
        data.loc[data['TARGET'] == target_value, col],
        color=curve_color,
        label=f'target == {target_value}',
    )
# Legend entries are listed in drawing order.
plt.legend(['Repayed', 'Default'])
plt.xlabel(col)
plt.ylabel('Density')
plt.title(f'Distribution of {col}');
data['AMT_GOODS_PRICE'].describe()
count 3.072330e+05 mean 5.383962e+05 std 3.694465e+05 min 4.050000e+04 25% 2.385000e+05 50% 4.500000e+05 75% 6.795000e+05 max 4.050000e+06 Name: AMT_GOODS_PRICE, dtype: float64
Pas de valeur aberrante
# Histogram of the goods price (100 bins).
plt.hist(data['AMT_GOODS_PRICE'], bins=100, edgecolor='k')
plt.title('Prices of goods')
plt.xlabel('price')
plt.ylabel('Count');
col = 'AMT_GOODS_PRICE'
plt.figure(figsize=(8, 6))
# One density curve per TARGET class: repaid loans (0) first, then defaults (1).
for target_value, curve_color in ((0, 'dodgerblue'), (1, 'red')):
    sns.kdeplot(
        data.loc[data['TARGET'] == target_value, col],
        color=curve_color,
        label=f'target == {target_value}',
    )
# Legend entries are listed in drawing order.
plt.legend(['Repayed', 'Default'])
plt.xlabel(col)
plt.ylabel('Density')
plt.title(f'Distribution of {col}');
(data['DAYS_REGISTRATION'] / -365).describe()
count 307511.000000 mean 13.660604 std 9.651743 min -0.000000 25% 5.506849 50% 12.339726 75% 20.491781 max 67.594521 Name: DAYS_REGISTRATION, dtype: float64
# Histogram of the years elapsed since registration (days divided by -365).
registration_years = data['DAYS_REGISTRATION'] / -365
plt.hist(registration_years, bins=25, edgecolor='k')
plt.title('Age of Registration')
plt.xlabel('Age (years)')
plt.ylabel('Count');
col = 'DAYS_REGISTRATION'
plt.figure(figsize=(8, 6))
# One density curve per TARGET class: repaid loans (0) first, then defaults (1).
for target_value, curve_color in ((0, 'dodgerblue'), (1, 'red')):
    sns.kdeplot(
        data.loc[data['TARGET'] == target_value, col],
        color=curve_color,
        label=f'target == {target_value}',
    )
# Legend entries are listed in drawing order.
plt.legend(['Repayed', 'Default'])
plt.xlabel(col)
plt.ylabel('Density')
plt.title(f'Distribution of {col}');
(data['DAYS_LAST_PHONE_CHANGE'] / -365).describe()
count 307510.000000 mean 2.637969 std 2.265229 min -0.000000 25% 0.750685 50% 2.073973 75% 4.301370 max 11.758904 Name: DAYS_LAST_PHONE_CHANGE, dtype: float64
# Histogram of the years elapsed since the last phone change (days / -365).
phone_change_years = data['DAYS_LAST_PHONE_CHANGE'] / -365
plt.hist(phone_change_years, bins=25, edgecolor='k')
plt.title('Age of phone')
plt.xlabel('Age (years)')
plt.ylabel('Count');
col = 'DAYS_LAST_PHONE_CHANGE'
plt.figure(figsize=(8, 6))
# One density curve per TARGET class: repaid loans (0) first, then defaults (1).
for target_value, curve_color in ((0, 'dodgerblue'), (1, 'red')):
    sns.kdeplot(
        data.loc[data['TARGET'] == target_value, col],
        color=curve_color,
        label=f'target == {target_value}',
    )
# Legend entries are listed in drawing order.
plt.legend(['Repayed', 'Default'])
plt.xlabel(col)
plt.ylabel('Density')
plt.title(f'Distribution of {col}');
# All column names of the working frame, target excluded.
vars_names = list(data.columns)
vars_names.remove('TARGET')
# Indicator variables are recognised by their FLAG_ prefix.
flag_cols = [name for name in vars_names if name.startswith('FLAG_')]
flag_cols
['FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'FLAG_MOBIL', 'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE', 'FLAG_PHONE', 'FLAG_EMAIL', 'FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_3', 'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_5', 'FLAG_DOCUMENT_6', 'FLAG_DOCUMENT_7', 'FLAG_DOCUMENT_8', 'FLAG_DOCUMENT_9', 'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_11', 'FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_13', 'FLAG_DOCUMENT_14', 'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_16', 'FLAG_DOCUMENT_17', 'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20', 'FLAG_DOCUMENT_21']
# Correlation of each FLAG_ variable with TARGET, sorted in ascending order.
# The flags and the target are selected in one indexing step and copied
# explicitly: adding a column to a selection of `data` afterwards triggers
# pandas' SettingWithCopyWarning.
flag = data[flag_cols + ['TARGET']].copy()
# Correlate numeric columns only: FLAG_OWN_CAR and FLAG_OWN_REALTY hold
# 'Y'/'N' strings, which pandas >= 2 refuses to pass through corr().
flag_corr = flag.select_dtypes('number').corr()['TARGET'].sort_values()
flag_corr
FLAG_DOCUMENT_6 -0.028602 FLAG_PHONE -0.023806 FLAG_DOCUMENT_16 -0.011615 FLAG_DOCUMENT_13 -0.011583 FLAG_DOCUMENT_14 -0.009464 FLAG_DOCUMENT_8 -0.008040 FLAG_DOCUMENT_18 -0.007952 FLAG_DOCUMENT_15 -0.006536 FLAG_DOCUMENT_9 -0.004352 FLAG_DOCUMENT_11 -0.004229 FLAG_DOCUMENT_17 -0.003378 FLAG_DOCUMENT_4 -0.002672 FLAG_EMAIL -0.001758 FLAG_DOCUMENT_7 -0.001520 FLAG_DOCUMENT_10 -0.001414 FLAG_DOCUMENT_19 -0.001358 FLAG_DOCUMENT_12 -0.000756 FLAG_DOCUMENT_5 -0.000316 FLAG_DOCUMENT_20 0.000215 FLAG_CONT_MOBILE 0.000370 FLAG_MOBIL 0.000534 FLAG_DOCUMENT_21 0.003709 FLAG_DOCUMENT_2 0.005417 FLAG_WORK_PHONE 0.028524 FLAG_DOCUMENT_3 0.044346 FLAG_EMP_PHONE 0.045982 TARGET 1.000000 Name: TARGET, dtype: float64
# One figure per flag whose absolute correlation with TARGET is at least 0.01.
notable_flags = flag_corr[flag_corr.abs() >= 0.01].index
for col in notable_flags:
    if col == 'TARGET':
        # The target correlates perfectly with itself; nothing to plot.
        continue
    plt.figure(figsize=(8, 6))
    # One density curve per TARGET class: repaid loans (0) first, then defaults (1).
    for target_value, curve_color in ((0, 'dodgerblue'), (1, 'red')):
        sns.kdeplot(
            data.loc[data['TARGET'] == target_value, col],
            color=curve_color,
            label=f'target == {target_value}',
        )
    # Legend entries are listed in drawing order.
    plt.legend(['Repayed', 'Default'])
    plt.xlabel(col)
    plt.ylabel('Density')
    plt.title(f'Distribution of {col}');
# Region-related variables: every name starting with 'REG', which covers both
# the REGION_* and the REG_* columns. The LIVE_*_NOT_WORK_* columns are
# deliberately not added.
region_cols = [name for name in vars_names if name.startswith('REG')]
region_cols
['REGION_POPULATION_RELATIVE', 'REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_W_CITY', 'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION', 'REG_CITY_NOT_LIVE_CITY', 'REG_CITY_NOT_WORK_CITY']
# Correlation of each region variable with TARGET, sorted in ascending order.
# The region columns and the target are selected in one indexing step and
# copied explicitly: adding a column to a selection of `data` afterwards
# triggers pandas' SettingWithCopyWarning.
region = data[region_cols + ['TARGET']].copy()
region_corr = region.corr()['TARGET'].sort_values()
region_corr
REGION_POPULATION_RELATIVE -0.037227 REG_REGION_NOT_LIVE_REGION 0.005576 REG_REGION_NOT_WORK_REGION 0.006942 REG_CITY_NOT_LIVE_CITY 0.044395 REG_CITY_NOT_WORK_CITY 0.050994 REGION_RATING_CLIENT 0.058899 REGION_RATING_CLIENT_W_CITY 0.060893 TARGET 1.000000 Name: TARGET, dtype: float64
# One figure per region variable whose absolute correlation with TARGET is at
# least 0.01.
notable_regions = region_corr[region_corr.abs() >= 0.01].index
for col in notable_regions:
    if col == 'TARGET':
        # The target correlates perfectly with itself; nothing to plot.
        continue
    plt.figure(figsize=(8, 6))
    # One density curve per TARGET class: repaid loans (0) first, then defaults (1).
    for target_value, curve_color in ((0, 'dodgerblue'), (1, 'red')):
        sns.kdeplot(
            data.loc[data['TARGET'] == target_value, col],
            color=curve_color,
            label=f'target == {target_value}',
        )
    # Legend entries are listed in drawing order.
    plt.legend(['Repayed', 'Default'])
    plt.xlabel(col)
    plt.ylabel('Density')
    plt.title(f'Distribution of {col}');
# TODO: pick a nicer plot style (candidate: plt.style.use('fivethirtyeight')).
# Histogram of EXT_SOURCE_1 (25 bins); the x axis is left unlabelled.
plt.hist(data['EXT_SOURCE_1'], bins=25, edgecolor='k')
plt.title('EXT_SOURCE_1')
plt.xlabel('')
plt.ylabel('Count');
plt.figure(figsize=(8, 6))
# One density curve per TARGET class: repaid loans (0) first, then defaults (1).
for target_value, curve_color in ((0, 'dodgerblue'), (1, 'red')):
    sns.kdeplot(
        data.loc[data['TARGET'] == target_value, 'EXT_SOURCE_1'],
        color=curve_color,
        label=f'target == {target_value}',
    )
# Legend entries are listed in drawing order.
plt.legend(['Repayed', 'Default'])
plt.xlabel('EXT_SOURCE_1')
plt.ylabel('Density')
plt.title('Distribution of EXT_SOURCE_1');
# TODO: pick a nicer plot style (candidate: plt.style.use('fivethirtyeight')).
# Histogram of EXT_SOURCE_2 (25 bins); the x axis is left unlabelled.
plt.hist(data['EXT_SOURCE_2'], bins=25, edgecolor='k')
plt.title('EXT_SOURCE_2')
plt.xlabel('')
plt.ylabel('Count');
plt.figure(figsize=(8, 6))
# One density curve per TARGET class: repaid loans (0) first, then defaults (1).
for target_value, curve_color in ((0, 'dodgerblue'), (1, 'red')):
    sns.kdeplot(
        data.loc[data['TARGET'] == target_value, 'EXT_SOURCE_2'],
        color=curve_color,
        label=f'target == {target_value}',
    )
# Legend entries are listed in drawing order.
plt.legend(['Repayed', 'Default'])
plt.xlabel('EXT_SOURCE_2')
plt.ylabel('Density')
plt.title('Distribution of EXT_SOURCE_2');
# TODO: pick a nicer plot style (candidate: plt.style.use('fivethirtyeight')).
# Histogram of EXT_SOURCE_3 (25 bins); the x axis is left unlabelled.
plt.hist(data['EXT_SOURCE_3'], bins=25, edgecolor='k')
plt.title('EXT_SOURCE_3')
plt.xlabel('')
plt.ylabel('Count');
def vars_vs_target(df, cols):
    """Draw, for each column in `cols`, its density split by TARGET value.

    One new 8x6 figure is opened per column. It holds the kernel density
    estimate of the rows where TARGET == 0 (blue) and of the rows where
    TARGET == 1 (red), labelled 'Repayed' and 'Default' in the legend.

    Parameters
    ----------
    df : DataFrame containing a 'TARGET' column and every column in `cols`.
    cols : iterable of column names to plot.
    """
    # (TARGET value, curve colour, curve label), in drawing order.
    curves = (
        (0, 'dodgerblue', 'target == 0'),
        (1, 'red', 'target == 1'),
    )
    for col in cols:
        plt.figure(figsize=(8, 6))
        for target_value, curve_color, curve_label in curves:
            sns.kdeplot(
                df.loc[df['TARGET'] == target_value, col],
                color=curve_color,
                label=curve_label,
            )
        # Legend entries are listed in drawing order.
        plt.legend(['Repayed', 'Default'])
        plt.xlabel(col)
        plt.ylabel('Density')
        plt.title(f'Distribution of {col}')
# Density of EXT_SOURCE_3 for each TARGET class.
vars_vs_target(data, ['EXT_SOURCE_3'])
# Print the frequency of every modality of each categorical (object) column.
categorical = data.select_dtypes('object')
for col in categorical.columns:
    print('>', col, '\n')
    print(categorical[col].value_counts(), '\n')
> NAME_CONTRACT_TYPE Cash loans 278232 Revolving loans 29279 Name: NAME_CONTRACT_TYPE, dtype: int64 > CODE_GENDER F 202448 M 105059 XNA 4 Name: CODE_GENDER, dtype: int64 > FLAG_OWN_CAR N 202924 Y 104587 Name: FLAG_OWN_CAR, dtype: int64 > FLAG_OWN_REALTY Y 213312 N 94199 Name: FLAG_OWN_REALTY, dtype: int64 > NAME_TYPE_SUITE Unaccompanied 248526 Family 40149 Spouse, partner 11370 Children 3267 Other_B 1770 Other_A 866 Group of people 271 Name: NAME_TYPE_SUITE, dtype: int64 > NAME_INCOME_TYPE Working 158774 Commercial associate 71617 Pensioner 55362 State servant 21703 Unemployed 22 Student 18 Businessman 10 Maternity leave 5 Name: NAME_INCOME_TYPE, dtype: int64 > NAME_EDUCATION_TYPE Secondary / secondary special 218391 Higher education 74863 Incomplete higher 10277 Lower secondary 3816 Academic degree 164 Name: NAME_EDUCATION_TYPE, dtype: int64 > NAME_FAMILY_STATUS Married 196432 Single / not married 45444 Civil marriage 29775 Separated 19770 Widow 16088 Unknown 2 Name: NAME_FAMILY_STATUS, dtype: int64 > NAME_HOUSING_TYPE House / apartment 272868 With parents 14840 Municipal apartment 11183 Rented apartment 4881 Office apartment 2617 Co-op apartment 1122 Name: NAME_HOUSING_TYPE, dtype: int64 > OCCUPATION_TYPE Laborers 55186 Sales staff 32102 Core staff 27570 Managers 21371 Drivers 18603 High skill tech staff 11380 Accountants 9813 Medicine staff 8537 Security staff 6721 Cooking staff 5946 Cleaning staff 4653 Private service staff 2652 Low-skill Laborers 2093 Waiters/barmen staff 1348 Secretaries 1305 Realty agents 751 HR staff 563 IT staff 526 Name: OCCUPATION_TYPE, dtype: int64 > WEEKDAY_APPR_PROCESS_START TUESDAY 53901 WEDNESDAY 51934 MONDAY 50714 THURSDAY 50591 FRIDAY 50338 SATURDAY 33852 SUNDAY 16181 Name: WEEKDAY_APPR_PROCESS_START, dtype: int64 > ORGANIZATION_TYPE Business Entity Type 3 67992 XNA 55374 Self-employed 38412 Other 16683 Medicine 11193 Business Entity Type 2 10553 Government 10404 School 8893 Trade: type 7 7831 Kindergarten 6880 Construction 6721 
Business Entity Type 1 5984 Transport: type 4 5398 Trade: type 3 3492 Industry: type 9 3368 Industry: type 3 3278 Security 3247 Housing 2958 Industry: type 11 2704 Military 2634 Bank 2507 Agriculture 2454 Police 2341 Transport: type 2 2204 Postal 2157 Security Ministries 1974 Trade: type 2 1900 Restaurant 1811 Services 1575 University 1327 Industry: type 7 1307 Transport: type 3 1187 Industry: type 1 1039 Hotel 966 Electricity 950 Industry: type 4 877 Trade: type 6 631 Industry: type 5 599 Insurance 597 Telecom 577 Emergency 560 Industry: type 2 458 Advertising 429 Realtor 396 Culture 379 Industry: type 12 369 Trade: type 1 348 Mobile 317 Legal Services 305 Cleaning 260 Transport: type 1 201 Industry: type 6 112 Industry: type 10 109 Religion 85 Industry: type 13 67 Trade: type 4 64 Trade: type 5 49 Industry: type 8 24 Name: ORGANIZATION_TYPE, dtype: int64 > FONDKAPREMONT_MODE reg oper account 73830 reg oper spec account 12080 not specified 5687 org spec account 5619 Name: FONDKAPREMONT_MODE, dtype: int64 > HOUSETYPE_MODE block of flats 150503 specific housing 1499 terraced house 1212 Name: HOUSETYPE_MODE, dtype: int64 > WALLSMATERIAL_MODE Panel 66040 Stone, brick 64815 Block 9253 Wooden 5362 Mixed 2296 Monolithic 1779 Others 1625 Name: WALLSMATERIAL_MODE, dtype: int64 > EMERGENCYSTATE_MODE No 159428 Yes 2328 Name: EMERGENCYSTATE_MODE, dtype: int64
# Discard the few rows whose gender is recorded as 'XNA'.
known_gender = data['CODE_GENDER'].ne('XNA')
data = data.loc[known_gender]
data.select_dtypes('object').describe()
| NAME_CONTRACT_TYPE | CODE_GENDER | FLAG_OWN_CAR | FLAG_OWN_REALTY | NAME_TYPE_SUITE | NAME_INCOME_TYPE | NAME_EDUCATION_TYPE | NAME_FAMILY_STATUS | NAME_HOUSING_TYPE | OCCUPATION_TYPE | WEEKDAY_APPR_PROCESS_START | ORGANIZATION_TYPE | FONDKAPREMONT_MODE | HOUSETYPE_MODE | WALLSMATERIAL_MODE | EMERGENCYSTATE_MODE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 307507 | 307507 | 307507 | 307507 | 306215 | 307507 | 307507 | 307507 | 307507 | 211118 | 307507 | 307507 | 97214 | 153211 | 151167 | 161753 |
| unique | 2 | 2 | 2 | 2 | 7 | 8 | 5 | 6 | 6 | 18 | 7 | 58 | 4 | 3 | 7 | 2 |
| top | Cash loans | F | N | Y | Unaccompanied | Working | Secondary / secondary special | Married | House / apartment | Laborers | TUESDAY | Business Entity Type 3 | reg oper account | block of flats | Panel | No |
| freq | 278232 | 202448 | 202922 | 213308 | 248523 | 158771 | 218389 | 196429 | 272865 | 55186 | 53900 | 67992 | 73829 | 150500 | 66039 | 159425 |
def compute_stacked(df, col):
    """Draw a stacked horizontal bar per modality of ``col`` split by TARGET.

    For every modality of ``col`` the share of 'Repayed' rows is drawn in blue
    on top of a full-width red bar, so the red part left visible is the share
    of 'Default' rows. The plot is drawn on the current matplotlib axes.

    Parameters
    ----------
    df : DataFrame
        Must contain ``col`` and a ``TARGET`` column holding the labels
        'Repayed' / 'Default'.
    col : str
        Name of the categorical column to analyse.

    Returns
    -------
    set
        Modalities of ``col`` that appear with only one of the two TARGET
        labels; they are excluded from the plot.
    """
    # Use the frame that was passed in (the previous version silently read
    # the notebook-level ``df_cat`` and ignored its ``df`` argument).
    pairs = df[[col, 'TARGET']]

    # Number of rows per modality.
    total = (pairs.groupby(col).count()
                  .reset_index()
                  .rename({'TARGET': 'total'}, axis=1))

    # Number of rows per (modality, TARGET) pair. Naming the Series before
    # reset_index() yields a 'count' column on every pandas version.
    cats = (pairs.value_counts()
                 .rename('count')
                 .reset_index()
                 .sort_values(['TARGET', col]))

    # Modalities seen for only one of the two labels cannot be stacked.
    in_default = set(cats[cats.TARGET == 'Default'][col])
    in_repayed = set(cats[cats.TARGET == 'Repayed'][col])
    diff = in_default.symmetric_difference(in_repayed)
    cats = cats[~cats[col].isin(diff)].copy()

    # Look the per-modality total up by key instead of relying on both
    # frames happening to be sorted the same way.
    cats['total'] = cats[col].map(total.set_index(col)['total'])
    cats['percent'] = cats['count'] / cats.total
    names = cats[col].unique()

    # Background: one full-length red bar per modality.
    bar1 = sns.barplot(x=[1]*len(names), y=names, orient='h', palette=['red', 'red'])
    # Foreground: share of repaid loans, drawn over the red bar.
    bar1 = sns.barplot(data=cats[cats.TARGET == 'Repayed'], x='percent', y=col,
                       orient='h', palette=['dodgerblue', 'dodgerblue'])
    # plt.legend(labels=['Repayed','Default'], labelcolor=['green','red'],loc='center')

    # Label the plots
    # NOTE(review): the x axis carries the repaid share and the y axis the
    # modalities, so these two labels look misleading - confirm before changing.
    plt.title('Modalities of %s by Target Value' % col)
    plt.xlabel('%s' % col)
    plt.ylabel('Density')
    return diff
# One panel per categorical variable, arranged on a 6 x 3 grid
plt.figure(figsize=(16, 20))

# Categorical columns side by side with a readable version of TARGET
df_cat = data.select_dtypes('object').join(
    data.TARGET.replace({0: 'Repayed', 1: 'Default'})
)

# Variables for which some modality exists for only one TARGET label
var_with_diff = {}
for i, col in enumerate(df_cat.columns):
    if col == 'TARGET':
        continue
    # draw this variable in its own cell of the grid
    plt.subplot(6, 3, i + 1)
    diff = compute_stacked(df_cat, col)
    if len(diff) > 0:
        var_with_diff[col] = diff

plt.tight_layout(h_pad=3.5)

# Report the modalities that had to be left out of the plots
for key in var_with_diff:
    print(' Found difference in var {} between Default and Repay on modalities {}'.format(key, var_with_diff[key]))
Found difference in var NAME_INCOME_TYPE between Default and Repay on modalities {'Businessman', 'Student'}
Found difference in var NAME_FAMILY_STATUS between Default and Repay on modalities {'Unknown'}
# Redraw 'ORGANIZATION_TYPE' alone on a taller figure so its many modalities stay readable
plt.subplots(nrows=1, ncols=1, figsize=(8, 15))
_ = compute_stacked(df_cat, 'ORGANIZATION_TYPE')
Features créées après documentation et synthèse de kernels Kaggle. Liste non exhaustive.
Creating features found on different Kaggle Home Credit kernels
# Apply the cleaning steps found during the EDA and create some new features.
# Returns 2 data frames: the selected cleaned original columns and the new features.
def process_data(df, selected_cols=None):
    """Clean an application table and derive hand-crafted features from it.

    The input frame is left untouched: all work is done on a filtered copy.

    Parameters
    ----------
    df : DataFrame
        Numerical columns of the application table. It must contain every
        column referenced below (FLAG_DOCUMENT_*, AMT_*, DAYS_*, ...).
    selected_cols : list-like of str, optional
        Columns returned in the first frame. When omitted, the
        notebook-level ``most_corr_less_miss_cols`` is used, which is the
        historical behaviour.

    Returns
    -------
    (DataFrame, DataFrame)
        ``df.loc[:, selected_cols]`` after cleaning, and a frame holding only
        the newly created feature columns. Both share the filtered index.
    """
    if selected_cols is None:
        # Fall back to the selection computed earlier in the notebook.
        selected_cols = most_corr_less_miss_cols

    # Original features
    original_features = list(df.columns)

    ## Outliers
    # There is an outlier in the train data: the person with the highest
    # AMT_INCOME_TOTAL had difficulty paying the loan. The variable was not
    # retained by the first filter step but is used for feature engineering.
    # .copy() gives us a private frame, so the column assignments below
    # neither touch the caller's data nor write into a slice.
    df = df[df['AMT_INCOME_TOTAL'] < (0.2 * 1e8)].copy()

    ## Incorrect values
    # 365243 is a sentinel/outlier value of DAYS_EMPLOYED: treat it as missing.
    # Explicit assignment instead of a chained ``inplace=True`` call, which
    # mutates the caller's frame and does nothing under pandas Copy-on-Write.
    df['DAYS_EMPLOYED'] = df['DAYS_EMPLOYED'].replace({365243: np.nan})
    # XNA value doesn't mean anything, so it is removed from the train data
    # df=df[df['CODE_GENDER']!='XNA']

    ## Creating features
    document_flags = ['FLAG_DOCUMENT_%d' % num for num in range(2, 22)]
    bureau_requests = ['AMT_REQ_CREDIT_BUREAU_HOUR', 'AMT_REQ_CREDIT_BUREAU_DAY',
                       'AMT_REQ_CREDIT_BUREAU_WEEK', 'AMT_REQ_CREDIT_BUREAU_MON',
                       'AMT_REQ_CREDIT_BUREAU_QRT', 'AMT_REQ_CREDIT_BUREAU_YEAR']

    # Total number of flagged documents
    df['DOCUMENT_COUNT'] = (df[document_flags] == 1).sum(axis=1)
    # Sum of AMT_REQ_CREDIT_BUREAU_* (rows where all are missing sum to 0)
    df['AMT_REQ_CREDIT_BUREAU_HDWMQY'] = df[bureau_requests].sum(axis=1)
    # Percentage of customer life spent working
    df['DAYS_WORKING_PER'] = df['DAYS_EMPLOYED'] / df['DAYS_BIRTH']
    # Whole life days of unemployment
    df['DAYS_UNEMPLOYED'] = abs(df['DAYS_BIRTH']) - abs(df['DAYS_EMPLOYED'])
    # Ratio AMT_INCOME_TOTAL / AMT_GOODS_PRICE
    df['GOODS_PRICE_INCOME_TOTAL_PER'] = df['AMT_INCOME_TOTAL'] / df['AMT_GOODS_PRICE']
    # Ratio AMT_CREDIT / AMT_GOODS_PRICE
    df['GOODS_PRICE_CREDIT_PER'] = df['AMT_CREDIT'] / df['AMT_GOODS_PRICE']
    # Ratio AMT_ANNUITY / AMT_GOODS_PRICE
    df['GOODS_PRICE_AMT_ANNUITY_PER'] = df['AMT_ANNUITY'] / df['AMT_GOODS_PRICE']
    # Ratio |DAYS_EMPLOYED| / AMT_GOODS_PRICE
    df['GOODS_PRICE_EMP'] = abs(df['DAYS_EMPLOYED']) / df['AMT_GOODS_PRICE']
    # AMT_CREDIT per year of age
    df['AMT_CREDIT_BIRTH'] = df['AMT_CREDIT'] / abs(df['DAYS_BIRTH'] / 365)
    # Ratio AMT_INCOME_TOTAL / AMT_CREDIT
    df['INCOME_CREDIT_PER'] = df['AMT_INCOME_TOTAL'] / df['AMT_CREDIT']
    # Mean income in family
    # NOTE(review): divides by CNT_FAM_MEMBERS + 1 while REST_TO_LIVE divides
    # by CNT_FAM_MEMBERS - confirm the +1 is intended.
    df['INCOME_PER_PERSON'] = df['AMT_INCOME_TOTAL'] / (df['CNT_FAM_MEMBERS'] + 1)
    # Rest to live per person after repayment
    df['REST_TO_LIVE'] = (df['AMT_INCOME_TOTAL'] - df['AMT_ANNUITY']) / df['CNT_FAM_MEMBERS']
    # Ratio DAYS_EMPLOYED / AMT_ANNUITY
    df['ANNUITY_DAYS_EMPLOYED_PERC'] = df['DAYS_EMPLOYED'] / df['AMT_ANNUITY']
    # Ratio DAYS_EMPLOYED / AMT_CREDIT
    df['AMT_CREDIT_DAYS_EMPLOYED_PERC'] = df['DAYS_EMPLOYED'] / df['AMT_CREDIT']
    # Ratio DAYS_BIRTH / AMT_ANNUITY
    df['ANNUITY_DAYS_BIRTH_PERC'] = df['DAYS_BIRTH'] / df['AMT_ANNUITY']
    # Annuity relative to the credited amount, and its inverse
    df['PAYMENT_RATE'] = df['AMT_ANNUITY'] / df['AMT_CREDIT']
    df['PAYMENT_RATE_INV'] = df['AMT_CREDIT'] / df['AMT_ANNUITY']
    # Income left once the annuity is paid
    df['PAY_TOWARDS_LOAN'] = df['AMT_INCOME_TOTAL'] - df['AMT_ANNUITY']
    # Mean of the *_CNT_SOCIAL_CIRCLE columns
    # NOTE(review): floor division truncates the mean - confirm it is wanted.
    df['MEAN_DEFAULT_SURR'] = df[['OBS_30_CNT_SOCIAL_CIRCLE',
                                  'DEF_30_CNT_SOCIAL_CIRCLE',
                                  'OBS_60_CNT_SOCIAL_CIRCLE',
                                  'DEF_60_CNT_SOCIAL_CIRCLE']].sum(axis=1) // 4
    # Sum of the address mismatch flags
    df['ADDRESS_MISSMATCH'] = df[['REG_REGION_NOT_LIVE_REGION',
                                  'REG_REGION_NOT_WORK_REGION',
                                  'LIVE_REGION_NOT_WORK_REGION',
                                  'REG_CITY_NOT_LIVE_CITY',
                                  'REG_CITY_NOT_WORK_CITY',
                                  'LIVE_CITY_NOT_WORK_CITY']].sum(axis=1)
    # Mean of enquiries (stays NaN when every request count is missing)
    df['MEAN_ENQUIRIES'] = df[bureau_requests].mean(axis=1)
    # Sum of the contact flags
    df['CONTACT_REF'] = df[['FLAG_MOBIL',
                            'FLAG_EMP_PHONE',
                            'FLAG_WORK_PHONE',
                            'FLAG_CONT_MOBILE',
                            'FLAG_PHONE',
                            'FLAG_EMAIL']].sum(axis=1)
    # Max of the DAYS_* change indicators (closest to the application date)
    df['MAX_DAYS_SOMETHING_CHANGED'] = df[['DAYS_EMPLOYED', 'DAYS_ID_PUBLISH',
                                           'DAYS_REGISTRATION']].max(axis=1)

    # Everything that was not in the input is a new feature; creation order is kept.
    known = set(original_features)
    new_features = [col for col in df.columns if col not in known]
    # return df.loc[:,original_features], df.loc[:,new_features]
    return df.loc[:, selected_cols], df.loc[:, new_features]
# Run process_data on the numerical columns of application_train: the first
# result holds the retained original columns, the second the new features.
selected_numerical_app_train, domain_features = process_data(app_train.select_dtypes('number'))
# Count how many retained columns there are of each dtype
selected_numerical_app_train.dtypes.value_counts()
float64 11 int64 9 dtype: int64
# Summarise the new features together with TARGET using the project helper
# numerical_summary (imported from Functions).
new_features_summary = numerical_summary(domain_features.join(selected_numerical_app_train.TARGET))
# qgrid.QgridWidget(df=new_features_summary)
new_features_summary
Data Frame a 24 colonnes. Dont 24 colonnes contiennent des valeurs manquantes.
| correlations | valeurs_manquantes | %_total | |
|---|---|---|---|
| DAYS_WORKING_PER | -0.07 | 55374 | 18.01 |
| GOODS_PRICE_EMP | -0.05 | 55630 | 18.09 |
| PAYMENT_RATE_INV | -0.03 | 12 | 0.00 |
| PAY_TOWARDS_LOAN | -0.02 | 12 | 0.00 |
| INCOME_PER_PERSON | -0.02 | 2 | 0.00 |
| DAYS_UNEMPLOYED | -0.02 | 55374 | 18.01 |
| REST_TO_LIVE | -0.01 | 14 | 0.00 |
| INCOME_CREDIT_PER | -0.01 | 0 | 0.00 |
| AMT_REQ_CREDIT_BUREAU_HDWMQY | -0.00 | 0 | 0.00 |
| AMT_CREDIT_BIRTH | -0.00 | 0 | 0.00 |
| GOODS_PRICE_INCOME_TOTAL_PER | -0.00 | 278 | 0.09 |
| MEAN_ENQUIRIES | 0.01 | 41519 | 13.50 |
| PAYMENT_RATE | 0.01 | 12 | 0.00 |
| MEAN_DEFAULT_SURR | 0.01 | 0 | 0.00 |
| DOCUMENT_COUNT | 0.02 | 0 | 0.00 |
| CONTACT_REF | 0.02 | 0 | 0.00 |
| GOODS_PRICE_AMT_ANNUITY_PER | 0.03 | 290 | 0.09 |
| ANNUITY_DAYS_BIRTH_PERC | 0.04 | 12 | 0.00 |
| ADDRESS_MISSMATCH | 0.04 | 0 | 0.00 |
| AMT_CREDIT_DAYS_EMPLOYED_PERC | 0.05 | 55374 | 18.01 |
| ANNUITY_DAYS_EMPLOYED_PERC | 0.06 | 55386 | 18.01 |
| GOODS_PRICE_CREDIT_PER | 0.07 | 278 | 0.09 |
| MAX_DAYS_SOMETHING_CHANGED | 0.07 | 0 | 0.00 |
| TARGET | 1.00 | 0 | 0.00 |
# Descriptive statistics (count, mean, std, quantiles) of every new feature
domain_features.describe()
| DOCUMENT_COUNT | AMT_REQ_CREDIT_BUREAU_HDWMQY | DAYS_WORKING_PER | DAYS_UNEMPLOYED | GOODS_PRICE_INCOME_TOTAL_PER | GOODS_PRICE_CREDIT_PER | GOODS_PRICE_AMT_ANNUITY_PER | GOODS_PRICE_EMP | AMT_CREDIT_BIRTH | INCOME_CREDIT_PER | INCOME_PER_PERSON | REST_TO_LIVE | ANNUITY_DAYS_EMPLOYED_PERC | AMT_CREDIT_DAYS_EMPLOYED_PERC | ANNUITY_DAYS_BIRTH_PERC | PAYMENT_RATE | PAYMENT_RATE_INV | PAY_TOWARDS_LOAN | MEAN_DEFAULT_SURR | ADDRESS_MISSMATCH | MEAN_ENQUIRIES | CONTACT_REF | MAX_DAYS_SOMETHING_CHANGED | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 307510.000000 | 307510.000000 | 252136.000000 | 252136.000000 | 307232.000000 | 307232.000000 | 307220.000000 | 251880.000000 | 307510.000000 | 307510.000000 | 3.075080e+05 | 3.074960e+05 | 252124.000000 | 252136.000000 | 307498.000000 | 307498.000000 | 307498.000000 | 3.074980e+05 | 307510.000000 | 307510.000000 | 265991.000000 | 307510.000000 | 307510.000000 |
| mean | 0.930155 | 2.145690 | 0.156862 | 12384.966593 | 0.444008 | 1.122995 | 0.059919 | 0.006752 | 14487.998633 | 0.398995 | 5.738732e+04 | 7.823721e+04 | -0.111381 | -0.006057 | -0.797778 | 0.053695 | 21.612323 | 1.413087e+05 | 0.620985 | 0.594755 | 0.413436 | 3.355175 | -1677.231160 |
| std | 0.344295 | 2.290872 | 0.133549 | 3584.351865 | 0.377256 | 0.124045 | 0.024655 | 0.010384 | 10386.297809 | 0.343265 | 3.956156e+04 | 6.741277e+04 | 0.146103 | 0.009323 | 0.586488 | 0.022481 | 7.823836 | 1.035966e+05 | 1.198462 | 1.084639 | 0.381389 | 0.865289 | 1417.364822 |
| min | 0.000000 | 0.000000 | -0.000000 | 6544.000000 | 0.011801 | 0.150000 | 0.007500 | 0.000000 | 660.062691 | 0.011801 | 2.647059e+03 | -1.283850e+04 | -5.023094 | -0.256178 | -14.539771 | 0.022073 | 8.036674 | -2.567700e+04 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | -6337.000000 |
| 25% | 1.000000 | 0.000000 | 0.056099 | 9430.000000 | 0.216667 | 1.000000 | 0.042310 | 0.001580 | 6857.833336 | 0.193803 | 3.375000e+04 | 3.867450e+04 | -0.135635 | -0.006947 | -0.979850 | 0.036900 | 15.614496 | 8.544150e+04 | 0.000000 | 0.000000 | 0.166667 | 3.000000 | -2581.000000 |
| 50% | 1.000000 | 2.000000 | 0.118734 | 11952.000000 | 0.340000 | 1.118800 | 0.051380 | 0.003607 | 11969.336027 | 0.306272 | 4.950000e+04 | 6.122700e+04 | -0.065883 | -0.003236 | -0.626722 | 0.050000 | 20.000000 | 1.210770e+05 | 0.000000 | 0.000000 | 0.333333 | 3.000000 | -1202.000000 |
| 75% | 1.000000 | 3.000000 | 0.219170 | 14950.000000 | 0.543478 | 1.198000 | 0.072120 | 0.007733 | 19439.830851 | 0.495376 | 6.750000e+04 | 9.774675e+04 | -0.029378 | -0.001410 | -0.423730 | 0.064043 | 27.099985 | 1.732500e+05 | 1.000000 | 2.000000 | 0.666667 | 4.000000 | -516.000000 |
| max | 4.000000 | 262.000000 | 0.728811 | 24897.000000 | 26.666800 | 6.000000 | 0.300000 | 0.276867 | 130184.940555 | 26.666800 | 4.500000e+06 | 6.684527e+06 | 0.000000 | 0.000000 | -0.043849 | 0.124430 | 45.305079 | 1.793079e+07 | 187.000000 | 6.000000 | 43.666667 | 6.000000 | 0.000000 |
# Inspect the rows whose REST_TO_LIVE is negative
domain_features[domain_features.REST_TO_LIVE <0]
| DOCUMENT_COUNT | AMT_REQ_CREDIT_BUREAU_HDWMQY | DAYS_WORKING_PER | DAYS_UNEMPLOYED | GOODS_PRICE_INCOME_TOTAL_PER | GOODS_PRICE_CREDIT_PER | GOODS_PRICE_AMT_ANNUITY_PER | GOODS_PRICE_EMP | AMT_CREDIT_BIRTH | INCOME_CREDIT_PER | INCOME_PER_PERSON | REST_TO_LIVE | ANNUITY_DAYS_EMPLOYED_PERC | AMT_CREDIT_DAYS_EMPLOYED_PERC | ANNUITY_DAYS_BIRTH_PERC | PAYMENT_RATE | PAYMENT_RATE_INV | PAY_TOWARDS_LOAN | MEAN_DEFAULT_SURR | ADDRESS_MISSMATCH | MEAN_ENQUIRIES | CONTACT_REF | MAX_DAYS_SOMETHING_CHANGED | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| SK_ID_CURR | |||||||||||||||||||||||
| 100784 | 0 | 0.0 | NaN | NaN | 0.028640 | 1.000000 | 0.039348 | NaN | 31026.892385 | 0.028640 | 18000.0 | -10095.750 | NaN | NaN | -0.298970 | 0.039348 | 25.413963 | -20191.5 | 3.0 | 0 | NaN | 2 | -4714.0 |
| 124157 | 0 | 0.0 | NaN | NaN | 0.011801 | 1.000000 | 0.022139 | NaN | 36869.800623 | 0.011801 | 8550.0 | -11234.250 | NaN | NaN | -0.447167 | 0.022139 | 45.169737 | -22468.5 | 1.0 | 0 | NaN | 2 | -1669.0 |
| 129999 | 1 | 0.0 | NaN | NaN | 0.051160 | 1.138600 | 0.058280 | NaN | 18564.130435 | 0.044932 | 15348.0 | -3204.000 | NaN | NaN | -0.384123 | 0.051186 | 19.536719 | -6408.0 | 1.0 | 0 | NaN | 3 | -2417.0 |
| 141454 | 1 | 3.0 | NaN | NaN | 0.022727 | 1.118800 | 0.029580 | NaN | 36306.994163 | 0.020314 | 15000.0 | -6783.750 | NaN | NaN | -0.380245 | 0.026439 | 37.823434 | -13567.5 | 0.0 | 0 | 0.500000 | 3 | -5325.0 |
| 156942 | 1 | 1.0 | NaN | NaN | 0.050000 | 1.237600 | 0.054707 | NaN | 16702.108896 | 0.040401 | 11250.0 | -1588.500 | NaN | NaN | -0.494381 | 0.044204 | 22.622471 | -3177.0 | 0.0 | 0 | 0.166667 | 3 | -1792.0 |
| 167672 | 1 | 3.0 | 0.103260 | 8528.0 | 0.034286 | 1.095040 | 0.038146 | 0.000623 | 66194.649842 | 0.031310 | 18000.0 | -3039.750 | -0.016345 | -0.000569 | -0.158290 | 0.034835 | 28.706764 | -6079.5 | 0.0 | 0 | 0.500000 | 3 | -982.0 |
| 174612 | 1 | 2.0 | NaN | NaN | 0.040000 | 1.166320 | 0.047600 | NaN | 28032.982781 | 0.034296 | 18000.0 | -5130.000 | NaN | NaN | -0.319032 | 0.040812 | 24.502521 | -10260.0 | 0.0 | 0 | 0.333333 | 2 | -1823.0 |
| 179800 | 1 | 0.0 | 0.212659 | 14828.0 | 0.046667 | 1.118800 | 0.047593 | 0.005933 | 14636.242234 | 0.041711 | 10500.0 | -312.750 | -0.124667 | -0.005303 | -0.586232 | 0.042540 | 23.507494 | -625.5 | 1.0 | 4 | 0.000000 | 3 | -2077.0 |
| 210988 | 2 | 4.0 | NaN | NaN | 0.042500 | 1.118800 | 0.045640 | NaN | 34139.222516 | 0.037987 | 25500.0 | -2826.000 | NaN | NaN | -0.262087 | 0.040794 | 24.513585 | -5652.0 | 1.0 | 0 | 0.666667 | 3 | -1035.0 |
| 230424 | 1 | 0.0 | NaN | NaN | 0.089109 | 1.039604 | 0.098208 | NaN | 7424.128282 | 0.085714 | 13500.0 | -2067.750 | NaN | NaN | -0.520438 | 0.094467 | 10.585745 | -4135.5 | 0.0 | 0 | 0.000000 | 2 | -4976.0 |
| 234376 | 1 | 2.0 | 0.204762 | 10354.0 | 0.088235 | 1.000000 | 0.094947 | 0.003485 | 21445.852535 | 0.088235 | 22500.0 | -2567.250 | -0.036704 | -0.003485 | -0.179254 | 0.094947 | 10.532185 | -5134.5 | 0.0 | 0 | 0.333333 | 4 | -2302.0 |
| 239922 | 1 | 3.0 | 0.208493 | 8276.0 | 0.083333 | 1.171600 | 0.085447 | 0.003230 | 27606.441278 | 0.071128 | 14062.5 | -475.500 | -0.037797 | -0.002757 | -0.181287 | 0.072932 | 13.711477 | -1426.5 | 0.0 | 2 | 0.500000 | 3 | -1306.0 |
| 242003 | 1 | 3.0 | 0.104724 | 10045.0 | 0.120000 | 1.105600 | 0.131500 | 0.002611 | 16184.919786 | 0.108538 | 18000.0 | -2587.500 | -0.019856 | -0.002362 | -0.189607 | 0.118940 | 8.407605 | -5175.0 | 2.0 | 0 | 0.500000 | 3 | -1175.0 |
| 244509 | 1 | 2.0 | NaN | NaN | 0.048000 | 1.166320 | 0.049532 | NaN | 21928.578297 | 0.041155 | 18000.0 | -861.750 | NaN | NaN | -0.391935 | 0.042469 | 23.546798 | -1723.5 | 0.0 | 0 | 0.333333 | 3 | -223.0 |
| 268238 | 1 | 1.0 | NaN | NaN | 0.055000 | 1.138600 | 0.061765 | NaN | 17565.046492 | 0.048305 | 16500.0 | -3044.250 | NaN | NaN | -0.383065 | 0.054246 | 18.434388 | -6088.5 | 1.0 | 0 | 0.166667 | 2 | -4488.0 |
| 283247 | 1 | 4.0 | NaN | NaN | 0.036000 | 1.277200 | 0.037460 | NaN | 22250.753076 | 0.028187 | 13500.0 | -821.250 | NaN | NaN | -0.559293 | 0.029330 | 34.095035 | -1642.5 | 0.0 | 0 | 0.666667 | 2 | -2368.0 |
| 293005 | 1 | 0.0 | NaN | NaN | 0.043333 | 1.118800 | 0.049473 | NaN | 12790.327595 | 0.038732 | 9750.0 | -2072.250 | NaN | NaN | -0.645346 | 0.044220 | 22.614203 | -4144.5 | 0.0 | 0 | 0.000000 | 2 | -4485.0 |
| 295032 | 1 | 0.0 | NaN | NaN | 0.037500 | 1.158400 | 0.040337 | NaN | 38709.567163 | 0.032372 | 22500.0 | -2553.750 | NaN | NaN | -0.270785 | 0.034822 | 28.717694 | -5107.5 | 0.0 | 0 | NaN | 2 | -3123.0 |
| 300643 | 1 | 1.0 | NaN | NaN | 0.099338 | 1.039603 | 0.101464 | NaN | 11161.406433 | 0.095554 | 22500.0 | -722.250 | NaN | NaN | -0.335067 | 0.097598 | 10.246067 | -1444.5 | 0.0 | 0 | 0.166667 | 2 | -4148.0 |
| 301115 | 1 | 2.0 | NaN | NaN | 0.039735 | 1.118801 | 0.047589 | NaN | 13262.070807 | 0.035516 | 9000.0 | -2668.500 | NaN | NaN | -0.647030 | 0.042536 | 23.509463 | -5337.0 | 0.0 | 0 | 0.333333 | 2 | -2281.0 |
| 329192 | 1 | 5.0 | NaN | NaN | 0.065359 | 1.079203 | 0.102654 | NaN | 11747.144064 | 0.060563 | 15000.0 | -12838.500 | NaN | NaN | -0.326655 | 0.095120 | 10.513052 | -25677.0 | 0.0 | 0 | 0.833333 | 3 | -4289.0 |
| 332872 | 1 | 2.0 | 0.189349 | 13730.0 | 0.036000 | 1.087120 | 0.046176 | 0.002851 | 26356.417902 | 0.033115 | 10125.0 | -3816.000 | -0.061735 | -0.002622 | -0.326038 | 0.042476 | 23.542966 | -11448.0 | 0.0 | 0 | 0.333333 | 3 | -473.0 |
| 338291 | 1 | 0.0 | NaN | NaN | 0.032000 | 1.087120 | 0.043224 | NaN | 19391.774544 | 0.029436 | 12000.0 | -6313.500 | NaN | NaN | -0.473400 | 0.039760 | 25.150842 | -12627.0 | 0.0 | 0 | NaN | 2 | -4958.0 |
| 346212 | 1 | 3.0 | 0.016274 | 16200.0 | 0.041667 | 1.000000 | 0.042487 | 0.000248 | 23937.333009 | 0.041667 | 15000.0 | -443.250 | -0.005840 | -0.000248 | -0.358886 | 0.042487 | 23.536334 | -886.5 | 0.0 | 0 | 0.500000 | 4 | -9.0 |
| 346685 | 1 | 3.0 | 0.377704 | 12342.0 | 0.040000 | 1.118800 | 0.047400 | 0.006659 | 23163.780064 | 0.035753 | 15000.0 | -4162.500 | -0.140478 | -0.005952 | -0.371927 | 0.042367 | 23.603376 | -8325.0 | 0.0 | 2 | 0.500000 | 3 | -3111.0 |
| 361920 | 1 | 3.0 | NaN | NaN | 0.025000 | 1.000000 | 0.034833 | NaN | 27743.760821 | 0.025000 | 15000.0 | -8849.250 | NaN | NaN | -0.377696 | 0.034833 | 28.708821 | -17698.5 | 4.0 | 0 | 0.500000 | 2 | -3102.0 |
| 364354 | 1 | 0.0 | 0.123457 | 11147.0 | 0.100000 | 1.052800 | 0.104130 | 0.003489 | 13597.735315 | 0.094985 | 9000.0 | -464.625 | -0.033505 | -0.003314 | -0.271392 | 0.098908 | 10.110439 | -1858.5 | 0.0 | 2 | NaN | 5 | -1570.0 |
| 364989 | 1 | 2.0 | NaN | NaN | 0.093333 | 1.039600 | 0.101273 | NaN | 11382.608213 | 0.089778 | 21000.0 | -2679.750 | NaN | NaN | -0.329172 | 0.097416 | 10.265289 | -5359.5 | 0.0 | 0 | 0.333333 | 2 | -1911.0 |
| 386520 | 1 | 2.0 | 0.099282 | 15922.0 | 0.046358 | 1.000000 | 0.067291 | 0.002583 | 14030.519885 | 0.046358 | 10500.0 | -7112.250 | -0.038382 | -0.002583 | -0.386598 | 0.067291 | 14.860742 | -14224.5 | 0.0 | 0 | 0.333333 | 3 | -1013.0 |
| 398036 | 1 | 0.0 | 0.058873 | 13332.0 | 0.066667 | 1.000000 | 0.069793 | 0.001236 | 17391.994917 | 0.066667 | 15000.0 | -1055.250 | -0.017703 | -0.001236 | -0.300697 | 0.069793 | 14.328016 | -2110.5 | 0.0 | 2 | 0.000000 | 5 | -834.0 |
| 406539 | 1 | 1.0 | 0.202303 | 8521.0 | 0.106250 | 1.105600 | 0.109713 | 0.006003 | 13600.059914 | 0.096102 | 7650.0 | -311.625 | -0.054714 | -0.005429 | -0.270454 | 0.099233 | 10.077247 | -1246.5 | 1.0 | 0 | 0.166667 | 5 | -1249.0 |
| 409321 | 1 | 0.0 | NaN | NaN | 0.060000 | 1.422400 | 0.069470 | NaN | 10622.406111 | 0.042182 | 9000.0 | -2130.750 | NaN | NaN | -0.703549 | 0.048840 | 20.475025 | -4261.5 | 0.0 | 0 | NaN | 3 | -4373.0 |
| 440148 | 1 | 2.0 | NaN | NaN | 0.036850 | 1.118800 | 0.047545 | NaN | 15468.257576 | 0.032937 | 11055.0 | -4812.750 | NaN | NaN | -0.555263 | 0.042496 | 23.531391 | -9625.5 | 2.0 | 0 | 0.333333 | 2 | -371.0 |
# Keep only the rows with a non-negative REST_TO_LIVE. Rows where it is NaN
# fail the comparison and are therefore dropped as well.
domain_features = domain_features.loc[domain_features['REST_TO_LIVE'] >= 0]
# Plot the remaining features against the (unfiltered) TARGET column
frame_vs_target(domain_features, app_train.TARGET, 'Domain features')
# Load the bureau table and show its (rows, columns) shape
bureau_data = pd.read_csv('../donnees/bureau.csv')
bureau_data.shape
(1716428, 17)
# Number of bureau credits recorded for each currency
bureau_data['CREDIT_CURRENCY'].value_counts()
currency 1 1715020 currency 2 1224 currency 3 174 currency 4 10 Name: CREDIT_CURRENCY, dtype: int64
# One-hot encode bureau_data with the project helper (from Functions).
# The second return value is presumably the list of dummy columns created -
# verify against the helper; the third return value is not used here.
bureau_data, bureau_data_cat_columns, _ = one_hot_encoding_dataframe(bureau_data)
bureau_data.describe()
| SK_ID_CURR | SK_ID_BUREAU | DAYS_CREDIT | CREDIT_DAY_OVERDUE | DAYS_CREDIT_ENDDATE | DAYS_ENDDATE_FACT | AMT_CREDIT_MAX_OVERDUE | CNT_CREDIT_PROLONG | AMT_CREDIT_SUM | AMT_CREDIT_SUM_DEBT | AMT_CREDIT_SUM_LIMIT | AMT_CREDIT_SUM_OVERDUE | DAYS_CREDIT_UPDATE | AMT_ANNUITY | CREDIT_ACTIVE_Active | CREDIT_ACTIVE_Bad debt | CREDIT_ACTIVE_Closed | CREDIT_ACTIVE_Sold | CREDIT_CURRENCY_currency 1 | CREDIT_CURRENCY_currency 2 | CREDIT_CURRENCY_currency 3 | CREDIT_CURRENCY_currency 4 | CREDIT_TYPE_Another type of loan | CREDIT_TYPE_Car loan | CREDIT_TYPE_Cash loan (non-earmarked) | CREDIT_TYPE_Consumer credit | CREDIT_TYPE_Credit card | CREDIT_TYPE_Interbank credit | CREDIT_TYPE_Loan for business development | CREDIT_TYPE_Loan for purchase of shares (margin lending) | CREDIT_TYPE_Loan for the purchase of equipment | CREDIT_TYPE_Loan for working capital replenishment | CREDIT_TYPE_Microloan | CREDIT_TYPE_Mobile operator loan | CREDIT_TYPE_Mortgage | CREDIT_TYPE_Real estate loan | CREDIT_TYPE_Unknown type of loan | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1.716428e+06 | 1.716428e+06 | 1.716428e+06 | 1.716428e+06 | 1.610875e+06 | 1.082775e+06 | 5.919400e+05 | 1.716428e+06 | 1.716415e+06 | 1.458759e+06 | 1.124648e+06 | 1.716428e+06 | 1.716428e+06 | 4.896370e+05 | 1.716428e+06 | 1.716428e+06 | 1.716428e+06 | 1.716428e+06 | 1.716428e+06 | 1.716428e+06 | 1.716428e+06 | 1.716428e+06 | 1.716428e+06 | 1.716428e+06 | 1.716428e+06 | 1.716428e+06 | 1.716428e+06 | 1.716428e+06 | 1.716428e+06 | 1.716428e+06 | 1.716428e+06 | 1.716428e+06 | 1.716428e+06 | 1.716428e+06 | 1.716428e+06 | 1.716428e+06 | 1.716428e+06 |
| mean | 2.782149e+05 | 5.924434e+06 | -1.142108e+03 | 8.181666e-01 | 5.105174e+02 | -1.017437e+03 | 3.825418e+03 | 6.410406e-03 | 3.549946e+05 | 1.370851e+05 | 6.229515e+03 | 3.791276e+01 | -5.937483e+02 | 1.571276e+04 | 3.673950e-01 | 1.223471e-05 | 6.287901e-01 | 3.802665e-03 | 9.991797e-01 | 7.131089e-04 | 1.013733e-04 | 5.826053e-06 | 5.925096e-04 | 1.613234e-02 | 3.262590e-05 | 7.291975e-01 | 2.343209e-01 | 5.826053e-07 | 1.150645e-03 | 2.330421e-06 | 1.106950e-05 | 2.732419e-04 | 7.231879e-03 | 5.826053e-07 | 1.071469e-02 | 1.573034e-05 | 3.233459e-04 |
| std | 1.029386e+05 | 5.322657e+05 | 7.951649e+02 | 3.654443e+01 | 4.994220e+03 | 7.140106e+02 | 2.060316e+05 | 9.622391e-02 | 1.149811e+06 | 6.774011e+05 | 4.503203e+04 | 5.937650e+03 | 7.207473e+02 | 3.258269e+05 | 4.820955e-01 | 3.497795e-03 | 4.831286e-01 | 6.154841e-02 | 2.862928e-02 | 2.669458e-02 | 1.006792e-02 | 2.413715e-03 | 2.433431e-02 | 1.259845e-01 | 5.711817e-03 | 4.443744e-01 | 4.235738e-01 | 7.632858e-04 | 3.390165e-02 | 1.526570e-03 | 3.327068e-03 | 1.652778e-02 | 8.473242e-02 | 7.632858e-04 | 1.029558e-01 | 3.966120e-03 | 1.797892e-02 |
| min | 1.000010e+05 | 5.000000e+06 | -2.922000e+03 | 0.000000e+00 | -4.206000e+04 | -4.202300e+04 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | -4.705600e+06 | -5.864061e+05 | 0.000000e+00 | -4.194700e+04 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 |
| 25% | 1.888668e+05 | 5.463954e+06 | -1.666000e+03 | 0.000000e+00 | -1.138000e+03 | -1.489000e+03 | 0.000000e+00 | 0.000000e+00 | 5.130000e+04 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | -9.080000e+02 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 1.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 |
| 50% | 2.780550e+05 | 5.926304e+06 | -9.870000e+02 | 0.000000e+00 | -3.300000e+02 | -8.970000e+02 | 0.000000e+00 | 0.000000e+00 | 1.255185e+05 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | -3.950000e+02 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 1.000000e+00 | 0.000000e+00 | 1.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 1.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 |
| 75% | 3.674260e+05 | 6.385681e+06 | -4.740000e+02 | 0.000000e+00 | 4.740000e+02 | -4.250000e+02 | 0.000000e+00 | 0.000000e+00 | 3.150000e+05 | 4.015350e+04 | 0.000000e+00 | 0.000000e+00 | -3.300000e+01 | 1.350000e+04 | 1.000000e+00 | 0.000000e+00 | 1.000000e+00 | 0.000000e+00 | 1.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 1.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 |
| max | 4.562550e+05 | 6.843457e+06 | 0.000000e+00 | 2.792000e+03 | 3.119900e+04 | 0.000000e+00 | 1.159872e+08 | 9.000000e+00 | 5.850000e+08 | 1.701000e+08 | 4.705600e+06 | 3.756681e+06 | 3.720000e+02 | 1.184534e+08 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 |
# Credits still flagged as active. This is taken before the columns below are
# added, so it does not contain them.
bureau_data_active = bureau_data[bureau_data['CREDIT_ACTIVE_Active'] == 1]

# Per-row count of set flags among the first group of credit types
# (presumably the secured loans, going by the column name)
bureau_data['SEC_LOAN_COUNT'] = (bureau_data[[
    'CREDIT_TYPE_Car loan',
    'CREDIT_TYPE_Loan for the purchase of equipment',
    'CREDIT_TYPE_Mortgage',
    'CREDIT_TYPE_Real estate loan',
    'CREDIT_TYPE_Loan for purchase of shares (margin lending)',
]] == 1).sum(axis=1)

# Per-row count of set flags among the remaining credit types
# (presumably the unsecured loans)
bureau_data['UNSEC_LOAN_COUNT'] = (bureau_data[[
    'CREDIT_TYPE_Another type of loan',
    'CREDIT_TYPE_Cash loan (non-earmarked)',
    'CREDIT_TYPE_Consumer credit',
    'CREDIT_TYPE_Credit card',
    'CREDIT_TYPE_Interbank credit',
    'CREDIT_TYPE_Loan for business development',
    'CREDIT_TYPE_Loan for working capital replenishment',
    'CREDIT_TYPE_Microloan',
    'CREDIT_TYPE_Mobile operator loan',
    'CREDIT_TYPE_Unknown type of loan',
]] == 1).sum(axis=1)

# Outstanding debt relative to the credit amount.
# NOTE(review): AMT_CREDIT_SUM can be 0, which yields inf/NaN here - check
# how downstream aggregations cope with it.
bureau_data['DEBT_RATIO'] = bureau_data['AMT_CREDIT_SUM_DEBT'] / bureau_data['AMT_CREDIT_SUM']

# Replace the outlier day value 365243 by NaN in every DAYS* column.
# Assign the result back instead of calling replace(..., inplace=True) on
# bureau_data[col]: that chained in-place call is not guaranteed to reach
# bureau_data and is a no-op under pandas Copy-on-Write.
for col in bureau_data.columns:
    if col.startswith('DAYS'):
        bureau_data[col] = bureau_data[col].replace(365243, np.nan)

bureau_data.head(5)
| SK_ID_CURR | SK_ID_BUREAU | DAYS_CREDIT | CREDIT_DAY_OVERDUE | DAYS_CREDIT_ENDDATE | DAYS_ENDDATE_FACT | AMT_CREDIT_MAX_OVERDUE | CNT_CREDIT_PROLONG | AMT_CREDIT_SUM | AMT_CREDIT_SUM_DEBT | AMT_CREDIT_SUM_LIMIT | AMT_CREDIT_SUM_OVERDUE | DAYS_CREDIT_UPDATE | AMT_ANNUITY | CREDIT_ACTIVE_Active | CREDIT_ACTIVE_Bad debt | CREDIT_ACTIVE_Closed | CREDIT_ACTIVE_Sold | CREDIT_CURRENCY_currency 1 | CREDIT_CURRENCY_currency 2 | CREDIT_CURRENCY_currency 3 | CREDIT_CURRENCY_currency 4 | CREDIT_TYPE_Another type of loan | CREDIT_TYPE_Car loan | CREDIT_TYPE_Cash loan (non-earmarked) | CREDIT_TYPE_Consumer credit | CREDIT_TYPE_Credit card | CREDIT_TYPE_Interbank credit | CREDIT_TYPE_Loan for business development | CREDIT_TYPE_Loan for purchase of shares (margin lending) | CREDIT_TYPE_Loan for the purchase of equipment | CREDIT_TYPE_Loan for working capital replenishment | CREDIT_TYPE_Microloan | CREDIT_TYPE_Mobile operator loan | CREDIT_TYPE_Mortgage | CREDIT_TYPE_Real estate loan | CREDIT_TYPE_Unknown type of loan | SEC_LOAN_COUNT | UNSEC_LOAN_COUNT | DEBT_RATIO | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 215354 | 5714462 | -497 | 0 | -153.0 | -153.0 | NaN | 0 | 91323.0 | 0.0 | NaN | 0.0 | -131 | NaN | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.00000 |
| 1 | 215354 | 5714463 | -208 | 0 | 1075.0 | NaN | NaN | 0 | 225000.0 | 171342.0 | NaN | 0.0 | -20 | NaN | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.76152 |
| 2 | 215354 | 5714464 | -203 | 0 | 528.0 | NaN | NaN | 0 | 464323.5 | NaN | NaN | 0.0 | -16 | NaN | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | NaN |
| 3 | 215354 | 5714465 | -203 | 0 | NaN | NaN | NaN | 0 | 90000.0 | NaN | NaN | 0.0 | -16 | NaN | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | NaN |
| 4 | 215354 | 5714466 | -629 | 0 | 1197.0 | NaN | 77674.5 | 0 | 2700000.0 | NaN | NaN | 0.0 | -21 | NaN | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | NaN |
# Load the bureau_balance table (monthly status rows keyed by SK_ID_BUREAU) and show its shape
bureau_balance_data = pd.read_csv('../donnees/bureau_balance.csv')
bureau_balance_data.shape
(27299925, 3)
# Preview the first rows
bureau_balance_data.head()
| SK_ID_BUREAU | MONTHS_BALANCE | STATUS | |
|---|---|---|---|
| 0 | 5715448 | 0 | C |
| 1 | 5715448 | -1 | C |
| 2 | 5715448 | -2 | C |
| 3 | 5715448 | -3 | C |
| 4 | 5715448 | -4 | C |
# Distribution of MONTHS_BALANCE
bureau_balance_data['MONTHS_BALANCE'].describe()
count 2.729992e+07 mean -3.074169e+01 std 2.386451e+01 min -9.600000e+01 25% -4.600000e+01 50% -2.500000e+01 75% -1.100000e+01 max 0.000000e+00 Name: MONTHS_BALANCE, dtype: float64
# One-hot encode bureau_balance_data with the project helper (from Functions).
# bureau_balance_data_cat_columns is reused below as the list of columns to
# average; the third return value is not used here.
bureau_balance_data, bureau_balance_data_cat_columns, _ = one_hot_encoding_dataframe(bureau_balance_data)
bureau_balance_data.describe()
| SK_ID_BUREAU | MONTHS_BALANCE | STATUS_0 | STATUS_1 | STATUS_2 | STATUS_3 | STATUS_4 | STATUS_5 | STATUS_C | STATUS_X | |
|---|---|---|---|---|---|---|---|---|---|---|
| count | 2.729992e+07 | 2.729992e+07 | 2.729992e+07 | 2.729992e+07 | 2.729992e+07 | 2.729992e+07 | 2.729992e+07 | 2.729992e+07 | 2.729992e+07 | 2.729992e+07 |
| mean | 6.036297e+06 | -3.074169e+01 | 2.747080e-01 | 8.877204e-03 | 8.578412e-04 | 3.268873e-04 | 2.141764e-04 | 2.285940e-03 | 4.998912e-01 | 2.128388e-01 |
| std | 4.923489e+05 | 2.386451e+01 | 4.463670e-01 | 9.379978e-02 | 2.927636e-02 | 1.807707e-02 | 1.463320e-02 | 4.775683e-02 | 5.000000e-01 | 4.093146e-01 |
| min | 5.001709e+06 | -9.600000e+01 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 |
| 25% | 5.730933e+06 | -4.600000e+01 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 |
| 50% | 6.070821e+06 | -2.500000e+01 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 |
| 75% | 6.431951e+06 | -1.100000e+01 | 1.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 1.000000e+00 | 0.000000e+00 |
| max | 6.842888e+06 | 0.000000e+00 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 |
# Number of missing values in each column
bureau_balance_data.isnull().sum()
SK_ID_BUREAU 0 MONTHS_BALANCE 0 STATUS_0 0 STATUS_1 0 STATUS_2 0 STATUS_3 0 STATUS_4 0 STATUS_5 0 STATUS_C 0 STATUS_X 0 dtype: int64
#Aggregate function to be applied on numerical column
bureau_balance_agg = {'MONTHS_BALANCE': ['min', 'max', 'size','mean','var']}
#Aggregate function to be applied on cat column
for col in bureau_balance_data_cat_columns:
bureau_balance_agg[col] = ['mean']
bureau_balance_data_agg = bureau_balance_data.groupby('SK_ID_BUREAU').agg(bureau_balance_agg)
month = -24
bureau_balance_data_temp = bureau_balance_data[bureau_balance_data.MONTHS_BALANCE >= month].copy()
bureau_balance_data_agg['STATUS_12_C'] = bureau_balance_data_temp.groupby('SK_ID_BUREAU')['STATUS_C'].mean()
month = -12
bureau_balance_data_temp = bureau_balance_data[bureau_balance_data.MONTHS_BALANCE >= month].copy()
bureau_balance_data_agg['STATUS_12_C'] = bureau_balance_data_temp.groupby('SK_ID_BUREAU')['STATUS_C'].mean()
month = -9
bureau_balance_data_temp = bureau_balance_data[bureau_balance_data.MONTHS_BALANCE >= month].copy()
bureau_balance_data_agg['STATUS_9_C'] = bureau_balance_data_temp.groupby('SK_ID_BUREAU')['STATUS_C'].mean()
month = -6
bureau_balance_data_temp = bureau_balance_data[bureau_balance_data.MONTHS_BALANCE >= month].copy()
bureau_balance_data_agg['STATUS_6_C'] = bureau_balance_data_temp.groupby('SK_ID_BUREAU')['STATUS_C'].mean()
month = -3
bureau_balance_data_temp = bureau_balance_data[bureau_balance_data.MONTHS_BALANCE >= month].copy()
bureau_balance_data_agg['STATUS_3_C'] = bureau_balance_data_temp.groupby('SK_ID_BUREAU')['STATUS_C'].mean()
# Drop the last per-window temporary frame and run a garbage-collection pass
# so its memory can be reclaimed.
import gc
del bureau_balance_data_temp
gc.collect()
57407
bureau_balance_data_agg.head()
| MONTHS_BALANCE | STATUS_4 | STATUS_3 | STATUS_5 | STATUS_X | STATUS_C | STATUS_0 | STATUS_2 | STATUS_1 | STATUS_12_C | STATUS_9_C | STATUS_6_C | STATUS_3_C | |||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| min | max | size | mean | var | mean | mean | mean | mean | mean | mean | mean | mean | |||||
| SK_ID_BUREAU | |||||||||||||||||
| 5001709 | -96 | 0 | 97 | -48.0 | 792.166667 | 0.0 | 0.0 | 0.0 | 0.113402 | 0.886598 | 0.000000 | 0.0 | 0.0 | 1.000000 | 1.0 | 1.0 | 1.0 |
| 5001710 | -82 | 0 | 83 | -41.0 | 581.000000 | 0.0 | 0.0 | 0.0 | 0.361446 | 0.578313 | 0.060241 | 0.0 | 0.0 | 1.000000 | 1.0 | 1.0 | 1.0 |
| 5001711 | -3 | 0 | 4 | -1.5 | 1.666667 | 0.0 | 0.0 | 0.0 | 0.250000 | 0.000000 | 0.750000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 |
| 5001712 | -18 | 0 | 19 | -9.0 | 31.666667 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.473684 | 0.526316 | 0.0 | 0.0 | 0.692308 | 0.9 | 1.0 | 1.0 |
| 5001713 | -21 | 0 | 22 | -10.5 | 42.166667 | 0.0 | 0.0 | 0.0 | 1.000000 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 |
# Flatten the two-level (column, statistic) header into "<COLUMN>_<STAT>" names.
# Columns added after the aggregation carry an empty second level, which is
# why their flattened name ends with a trailing "_".
modified_col = [
    "_".join((name, stat.upper()))
    for name, stat in bureau_balance_data_agg.columns
]
bureau_balance_data_agg.columns = modified_col
bureau_balance_data_agg.head()
| MONTHS_BALANCE_MIN | MONTHS_BALANCE_MAX | MONTHS_BALANCE_SIZE | MONTHS_BALANCE_MEAN | MONTHS_BALANCE_VAR | STATUS_4_MEAN | STATUS_3_MEAN | STATUS_5_MEAN | STATUS_X_MEAN | STATUS_C_MEAN | STATUS_0_MEAN | STATUS_2_MEAN | STATUS_1_MEAN | STATUS_12_C_ | STATUS_9_C_ | STATUS_6_C_ | STATUS_3_C_ | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| SK_ID_BUREAU | |||||||||||||||||
| 5001709 | -96 | 0 | 97 | -48.0 | 792.166667 | 0.0 | 0.0 | 0.0 | 0.113402 | 0.886598 | 0.000000 | 0.0 | 0.0 | 1.000000 | 1.0 | 1.0 | 1.0 |
| 5001710 | -82 | 0 | 83 | -41.0 | 581.000000 | 0.0 | 0.0 | 0.0 | 0.361446 | 0.578313 | 0.060241 | 0.0 | 0.0 | 1.000000 | 1.0 | 1.0 | 1.0 |
| 5001711 | -3 | 0 | 4 | -1.5 | 1.666667 | 0.0 | 0.0 | 0.0 | 0.250000 | 0.000000 | 0.750000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 |
| 5001712 | -18 | 0 | 19 | -9.0 | 31.666667 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.473684 | 0.526316 | 0.0 | 0.0 | 0.692308 | 0.9 | 1.0 | 1.0 |
| 5001713 | -21 | 0 | 22 | -10.5 | 42.166667 | 0.0 | 0.0 | 0.0 | 1.000000 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 |
# Attach the per-credit balance aggregates (indexed by SK_ID_BUREAU) to the
# bureau rows with a left join, so credits without balance history are kept
# with NaN. The join key is then removed from the resulting frame.
bureau_data = (
    bureau_data
    .join(bureau_balance_data_agg, on='SK_ID_BUREAU', how='left')
    .drop(columns=['SK_ID_BUREAU'])
)
# The balance aggregates now live inside bureau_data (joined just above):
# delete the standalone frame and trigger garbage collection.
import gc
del bureau_balance_data_agg
gc.collect()
0
# Statistic(s) used to collapse all of a client's credits into a single row.
# Every column defaults to the mean; the columns below override that default.
_bureau_overrides = {
    'AMT_CREDIT_SUM_DEBT': ['sum'],
    'AMT_CREDIT_SUM_OVERDUE': ['sum'],
    'UNSEC_LOAN_COUNT': ['sum'],
    'SEC_LOAN_COUNT': ['sum'],
    'DAYS_CREDIT': ['min', 'mean'],
    'DEBT_RATIO': ['sum'],
}
# SK_ID_CURR is the grouping key, so it is left out of the plan.
bureau_data_agg = {
    name: _bureau_overrides.get(name, ['mean'])
    for name in bureau_data.columns
    if name != 'SK_ID_CURR'
}
bureau_agg = bureau_data.groupby('SK_ID_CURR').agg(bureau_data_agg)
bureau_agg.head()
| DAYS_CREDIT | CREDIT_DAY_OVERDUE | DAYS_CREDIT_ENDDATE | DAYS_ENDDATE_FACT | AMT_CREDIT_MAX_OVERDUE | CNT_CREDIT_PROLONG | AMT_CREDIT_SUM | AMT_CREDIT_SUM_DEBT | AMT_CREDIT_SUM_LIMIT | AMT_CREDIT_SUM_OVERDUE | DAYS_CREDIT_UPDATE | AMT_ANNUITY | CREDIT_ACTIVE_Active | CREDIT_ACTIVE_Bad debt | CREDIT_ACTIVE_Closed | CREDIT_ACTIVE_Sold | CREDIT_CURRENCY_currency 1 | CREDIT_CURRENCY_currency 2 | CREDIT_CURRENCY_currency 3 | CREDIT_CURRENCY_currency 4 | CREDIT_TYPE_Another type of loan | CREDIT_TYPE_Car loan | CREDIT_TYPE_Cash loan (non-earmarked) | CREDIT_TYPE_Consumer credit | CREDIT_TYPE_Credit card | CREDIT_TYPE_Interbank credit | CREDIT_TYPE_Loan for business development | CREDIT_TYPE_Loan for purchase of shares (margin lending) | CREDIT_TYPE_Loan for the purchase of equipment | CREDIT_TYPE_Loan for working capital replenishment | CREDIT_TYPE_Microloan | CREDIT_TYPE_Mobile operator loan | CREDIT_TYPE_Mortgage | CREDIT_TYPE_Real estate loan | CREDIT_TYPE_Unknown type of loan | SEC_LOAN_COUNT | UNSEC_LOAN_COUNT | DEBT_RATIO | MONTHS_BALANCE_MIN | MONTHS_BALANCE_MAX | MONTHS_BALANCE_SIZE | MONTHS_BALANCE_MEAN | MONTHS_BALANCE_VAR | STATUS_4_MEAN | STATUS_3_MEAN | STATUS_5_MEAN | STATUS_X_MEAN | STATUS_C_MEAN | STATUS_0_MEAN | STATUS_2_MEAN | STATUS_1_MEAN | STATUS_12_C_ | STATUS_9_C_ | STATUS_6_C_ | STATUS_3_C_ | ||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| min | mean | mean | mean | mean | mean | mean | mean | sum | mean | sum | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | sum | sum | sum | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | |
| SK_ID_CURR | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 100001 | -1572 | -735.000000 | 0.0 | 82.428571 | -825.500000 | NaN | 0.0 | 207623.571429 | 596686.5 | 0.00000 | 0.0 | -93.142857 | 3545.357143 | 0.428571 | 0.0 | 0.571429 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 7 | 1.977625 | -23.571429 | 0.0 | 24.571429 | -11.785714 | 70.761905 | 0.0 | 0.0 | 0.0 | 0.214590 | 0.441240 | 0.336651 | 0.0 | 0.007519 | 0.571429 | 0.571429 | 0.571429 | 0.571429 |
| 100002 | -1437 | -874.000000 | 0.0 | -349.000000 | -697.500000 | 1681.029 | 0.0 | 108131.945625 | 245781.0 | 7997.14125 | 0.0 | -499.875000 | 0.000000 | 0.250000 | 0.0 | 0.750000 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.500000 | 0.500000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 8 | 0.546180 | -28.250000 | -15.5 | 13.750000 | -21.875000 | 19.854167 | 0.0 | 0.0 | 0.0 | 0.161932 | 0.175426 | 0.406960 | 0.0 | 0.255682 | 0.500000 | 0.500000 | 0.500000 | 0.500000 |
| 100003 | -2586 | -1400.750000 | 0.0 | -544.500000 | -1097.333333 | 0.000 | 0.0 | 254350.125000 | 0.0 | 202500.00000 | 0.0 | -816.000000 | NaN | 0.250000 | 0.0 | 0.750000 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.500000 | 0.500000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 4 | 0.000000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 100004 | -1326 | -867.000000 | 0.0 | -488.500000 | -532.500000 | 0.000 | 0.0 | 94518.900000 | 0.0 | 0.00000 | 0.0 | -532.000000 | NaN | 0.000000 | 0.0 | 1.000000 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 2 | 0.000000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 100005 | -373 | -190.666667 | 0.0 | 439.333333 | -123.000000 | 0.000 | 0.0 | 219042.000000 | 568408.5 | 0.00000 | 0.0 | -54.333333 | 1420.500000 | 0.666667 | 0.0 | 0.333333 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.666667 | 0.333333 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 3 | 1.803768 | -6.000000 | 0.0 | 7.000000 | -3.000000 | 6.222222 | 0.0 | 0.0 | 0.0 | 0.136752 | 0.128205 | 0.735043 | 0.0 | 0.000000 | 0.128205 | 0.166667 | 0.238095 | 0.333333 |
# Flatten the two-level (column, statistic) header of the per-client
# aggregates into single "<COLUMN>_<STAT>" names.
modified_col = [
    "_".join((name, stat.upper()))
    for name, stat in bureau_agg.columns
]
bureau_agg.columns = modified_col
bureau_agg.head()
| DAYS_CREDIT_MIN | DAYS_CREDIT_MEAN | CREDIT_DAY_OVERDUE_MEAN | DAYS_CREDIT_ENDDATE_MEAN | DAYS_ENDDATE_FACT_MEAN | AMT_CREDIT_MAX_OVERDUE_MEAN | CNT_CREDIT_PROLONG_MEAN | AMT_CREDIT_SUM_MEAN | AMT_CREDIT_SUM_DEBT_SUM | AMT_CREDIT_SUM_LIMIT_MEAN | AMT_CREDIT_SUM_OVERDUE_SUM | DAYS_CREDIT_UPDATE_MEAN | AMT_ANNUITY_MEAN | CREDIT_ACTIVE_Active_MEAN | CREDIT_ACTIVE_Bad debt_MEAN | CREDIT_ACTIVE_Closed_MEAN | CREDIT_ACTIVE_Sold_MEAN | CREDIT_CURRENCY_currency 1_MEAN | CREDIT_CURRENCY_currency 2_MEAN | CREDIT_CURRENCY_currency 3_MEAN | CREDIT_CURRENCY_currency 4_MEAN | CREDIT_TYPE_Another type of loan_MEAN | CREDIT_TYPE_Car loan_MEAN | CREDIT_TYPE_Cash loan (non-earmarked)_MEAN | CREDIT_TYPE_Consumer credit_MEAN | CREDIT_TYPE_Credit card_MEAN | CREDIT_TYPE_Interbank credit_MEAN | CREDIT_TYPE_Loan for business development_MEAN | CREDIT_TYPE_Loan for purchase of shares (margin lending)_MEAN | CREDIT_TYPE_Loan for the purchase of equipment_MEAN | CREDIT_TYPE_Loan for working capital replenishment_MEAN | CREDIT_TYPE_Microloan_MEAN | CREDIT_TYPE_Mobile operator loan_MEAN | CREDIT_TYPE_Mortgage_MEAN | CREDIT_TYPE_Real estate loan_MEAN | CREDIT_TYPE_Unknown type of loan_MEAN | SEC_LOAN_COUNT_SUM | UNSEC_LOAN_COUNT_SUM | DEBT_RATIO_SUM | MONTHS_BALANCE_MIN_MEAN | MONTHS_BALANCE_MAX_MEAN | MONTHS_BALANCE_SIZE_MEAN | MONTHS_BALANCE_MEAN_MEAN | MONTHS_BALANCE_VAR_MEAN | STATUS_4_MEAN_MEAN | STATUS_3_MEAN_MEAN | STATUS_5_MEAN_MEAN | STATUS_X_MEAN_MEAN | STATUS_C_MEAN_MEAN | STATUS_0_MEAN_MEAN | STATUS_2_MEAN_MEAN | STATUS_1_MEAN_MEAN | STATUS_12_C__MEAN | STATUS_9_C__MEAN | STATUS_6_C__MEAN | STATUS_3_C__MEAN | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| SK_ID_CURR | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 100001 | -1572 | -735.000000 | 0.0 | 82.428571 | -825.500000 | NaN | 0.0 | 207623.571429 | 596686.5 | 0.00000 | 0.0 | -93.142857 | 3545.357143 | 0.428571 | 0.0 | 0.571429 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 7 | 1.977625 | -23.571429 | 0.0 | 24.571429 | -11.785714 | 70.761905 | 0.0 | 0.0 | 0.0 | 0.214590 | 0.441240 | 0.336651 | 0.0 | 0.007519 | 0.571429 | 0.571429 | 0.571429 | 0.571429 |
| 100002 | -1437 | -874.000000 | 0.0 | -349.000000 | -697.500000 | 1681.029 | 0.0 | 108131.945625 | 245781.0 | 7997.14125 | 0.0 | -499.875000 | 0.000000 | 0.250000 | 0.0 | 0.750000 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.500000 | 0.500000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 8 | 0.546180 | -28.250000 | -15.5 | 13.750000 | -21.875000 | 19.854167 | 0.0 | 0.0 | 0.0 | 0.161932 | 0.175426 | 0.406960 | 0.0 | 0.255682 | 0.500000 | 0.500000 | 0.500000 | 0.500000 |
| 100003 | -2586 | -1400.750000 | 0.0 | -544.500000 | -1097.333333 | 0.000 | 0.0 | 254350.125000 | 0.0 | 202500.00000 | 0.0 | -816.000000 | NaN | 0.250000 | 0.0 | 0.750000 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.500000 | 0.500000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 4 | 0.000000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 100004 | -1326 | -867.000000 | 0.0 | -488.500000 | -532.500000 | 0.000 | 0.0 | 94518.900000 | 0.0 | 0.00000 | 0.0 | -532.000000 | NaN | 0.000000 | 0.0 | 1.000000 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 2 | 0.000000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 100005 | -373 | -190.666667 | 0.0 | 439.333333 | -123.000000 | 0.000 | 0.0 | 219042.000000 | 568408.5 | 0.00000 | 0.0 | -54.333333 | 1420.500000 | 0.666667 | 0.0 | 0.333333 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.666667 | 0.333333 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 3 | 1.803768 | -6.000000 | 0.0 | 7.000000 | -3.000000 | 6.222222 | 0.0 | 0.0 | 0.0 | 0.136752 | 0.128205 | 0.735043 | 0.0 | 0.000000 | 0.128205 | 0.166667 | 0.238095 | 0.333333 |
# Add the TARGET label to the per-client bureau aggregates. The inner join
# keeps only the SK_ID_CURR values found on both sides.
# NOTE(review): presumably app_train is indexed by SK_ID_CURR — confirm where it is loaded.
df_bureau = bureau_agg.join(app_train.TARGET, how='inner', on='SK_ID_CURR')
# Tag the frame with a label. NOTE(review): the helper output further down
# prints 'df_bureau', so the Functions helpers presumably read this attribute — confirm.
df_bureau.name = 'df_bureau'
df_bureau.shape
(263491, 57)
df_bureau.head()
| DAYS_CREDIT_MIN | DAYS_CREDIT_MEAN | CREDIT_DAY_OVERDUE_MEAN | DAYS_CREDIT_ENDDATE_MEAN | DAYS_ENDDATE_FACT_MEAN | AMT_CREDIT_MAX_OVERDUE_MEAN | CNT_CREDIT_PROLONG_MEAN | AMT_CREDIT_SUM_MEAN | AMT_CREDIT_SUM_DEBT_SUM | AMT_CREDIT_SUM_LIMIT_MEAN | AMT_CREDIT_SUM_OVERDUE_SUM | DAYS_CREDIT_UPDATE_MEAN | AMT_ANNUITY_MEAN | CREDIT_ACTIVE_Active_MEAN | CREDIT_ACTIVE_Bad debt_MEAN | CREDIT_ACTIVE_Closed_MEAN | CREDIT_ACTIVE_Sold_MEAN | CREDIT_CURRENCY_currency 1_MEAN | CREDIT_CURRENCY_currency 2_MEAN | CREDIT_CURRENCY_currency 3_MEAN | CREDIT_CURRENCY_currency 4_MEAN | CREDIT_TYPE_Another type of loan_MEAN | CREDIT_TYPE_Car loan_MEAN | CREDIT_TYPE_Cash loan (non-earmarked)_MEAN | CREDIT_TYPE_Consumer credit_MEAN | CREDIT_TYPE_Credit card_MEAN | CREDIT_TYPE_Interbank credit_MEAN | CREDIT_TYPE_Loan for business development_MEAN | CREDIT_TYPE_Loan for purchase of shares (margin lending)_MEAN | CREDIT_TYPE_Loan for the purchase of equipment_MEAN | CREDIT_TYPE_Loan for working capital replenishment_MEAN | CREDIT_TYPE_Microloan_MEAN | CREDIT_TYPE_Mobile operator loan_MEAN | CREDIT_TYPE_Mortgage_MEAN | CREDIT_TYPE_Real estate loan_MEAN | CREDIT_TYPE_Unknown type of loan_MEAN | SEC_LOAN_COUNT_SUM | UNSEC_LOAN_COUNT_SUM | DEBT_RATIO_SUM | MONTHS_BALANCE_MIN_MEAN | MONTHS_BALANCE_MAX_MEAN | MONTHS_BALANCE_SIZE_MEAN | MONTHS_BALANCE_MEAN_MEAN | MONTHS_BALANCE_VAR_MEAN | STATUS_4_MEAN_MEAN | STATUS_3_MEAN_MEAN | STATUS_5_MEAN_MEAN | STATUS_X_MEAN_MEAN | STATUS_C_MEAN_MEAN | STATUS_0_MEAN_MEAN | STATUS_2_MEAN_MEAN | STATUS_1_MEAN_MEAN | STATUS_12_C__MEAN | STATUS_9_C__MEAN | STATUS_6_C__MEAN | STATUS_3_C__MEAN | TARGET | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| SK_ID_CURR | |||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 100002 | -1437 | -874.000000 | 0.0 | -349.000000 | -697.500000 | 1681.029 | 0.0 | 108131.945625 | 245781.0 | 7997.14125 | 0.0 | -499.875 | 0.0 | 0.250000 | 0.0 | 0.750000 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.5 | 0.5 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 8 | 0.546180 | -28.25 | -15.5 | 13.75 | -21.875 | 19.854167 | 0.0 | 0.0 | 0.0 | 0.161932 | 0.175426 | 0.40696 | 0.0 | 0.255682 | 0.5 | 0.5 | 0.5 | 0.5 | 1 |
| 100003 | -2586 | -1400.750000 | 0.0 | -544.500000 | -1097.333333 | 0.000 | 0.0 | 254350.125000 | 0.0 | 202500.00000 | 0.0 | -816.000 | NaN | 0.250000 | 0.0 | 0.750000 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.5 | 0.5 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 4 | 0.000000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 |
| 100004 | -1326 | -867.000000 | 0.0 | -488.500000 | -532.500000 | 0.000 | 0.0 | 94518.900000 | 0.0 | 0.00000 | 0.0 | -532.000 | NaN | 0.000000 | 0.0 | 1.000000 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 2 | 0.000000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 |
| 100007 | -1149 | -1149.000000 | 0.0 | -783.000000 | -783.000000 | 0.000 | 0.0 | 146250.000000 | 0.0 | 0.00000 | 0.0 | -783.000 | NaN | 0.000000 | 0.0 | 1.000000 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 1 | 0.000000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 |
| 100008 | -1097 | -757.333333 | 0.0 | -391.333333 | -909.000000 | 0.000 | 0.0 | 156148.500000 | 240057.0 | 0.00000 | 0.0 | -611.000 | NaN | 0.333333 | 0.0 | 0.666667 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 3 | 0.897054 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 |
# Project helper from Functions; the table it prints below has one row per
# column with `correlations`, `valeurs_manquantes` and `%_total`.
numerical_summary(df_bureau)
Data Frame a 57 colonnes. Dont 57 colonnes contiennent des valeurs manquantes.
| correlations | valeurs_manquantes | %_total | |
|---|---|---|---|
| MONTHS_BALANCE_SIZE_MEAN | -0.08 | 171260 | 65.00 |
| CREDIT_ACTIVE_Closed_MEAN | -0.08 | 0 | 0.00 |
| MONTHS_BALANCE_VAR_MEAN | -0.06 | 171491 | 65.08 |
| STATUS_12_C__MEAN | -0.06 | 173436 | 65.82 |
| STATUS_9_C__MEAN | -0.06 | 173795 | 65.96 |
| STATUS_6_C__MEAN | -0.06 | 174245 | 66.13 |
| STATUS_C_MEAN_MEAN | -0.06 | 171260 | 65.00 |
| STATUS_3_C__MEAN | -0.06 | 174791 | 66.34 |
| SEC_LOAN_COUNT_SUM | -0.03 | 0 | 0.00 |
| CREDIT_TYPE_Consumer credit_MEAN | -0.03 | 0 | 0.00 |
| CREDIT_TYPE_Mortgage_MEAN | -0.02 | 0 | 0.00 |
| CREDIT_TYPE_Car loan_MEAN | -0.02 | 0 | 0.00 |
| AMT_CREDIT_SUM_MEAN | -0.02 | 1 | 0.00 |
| AMT_CREDIT_SUM_LIMIT_MEAN | -0.01 | 21049 | 7.99 |
| CREDIT_CURRENCY_currency 2_MEAN | -0.01 | 0 | 0.00 |
| CREDIT_TYPE_Loan for business development_MEAN | -0.00 | 0 | 0.00 |
| CREDIT_TYPE_Unknown type of loan_MEAN | -0.00 | 0 | 0.00 |
| CREDIT_TYPE_Real estate loan_MEAN | -0.00 | 0 | 0.00 |
| CREDIT_TYPE_Another type of loan_MEAN | -0.00 | 0 | 0.00 |
| CREDIT_CURRENCY_currency 4_MEAN | -0.00 | 0 | 0.00 |
| AMT_ANNUITY_MEAN | -0.00 | 183482 | 69.64 |
| CREDIT_TYPE_Cash loan (non-earmarked)_MEAN | -0.00 | 0 | 0.00 |
| CREDIT_TYPE_Loan for purchase of shares (margin lending)_MEAN | -0.00 | 0 | 0.00 |
| CREDIT_TYPE_Interbank credit_MEAN | -0.00 | 0 | 0.00 |
| CREDIT_TYPE_Mobile operator loan_MEAN | -0.00 | 0 | 0.00 |
| DEBT_RATIO_SUM | 0.00 | 32 | 0.01 |
| CREDIT_TYPE_Loan for the purchase of equipment_MEAN | 0.00 | 0 | 0.00 |
| STATUS_X_MEAN_MEAN | 0.00 | 171260 | 65.00 |
| CREDIT_TYPE_Loan for working capital replenishment_MEAN | 0.00 | 0 | 0.00 |
| AMT_CREDIT_MAX_OVERDUE_MEAN | 0.00 | 79605 | 30.21 |
| CREDIT_CURRENCY_currency 3_MEAN | 0.00 | 0 | 0.00 |
| CNT_CREDIT_PROLONG_MEAN | 0.00 | 0 | 0.00 |
| CREDIT_ACTIVE_Bad debt_MEAN | 0.00 | 0 | 0.00 |
| CREDIT_CURRENCY_currency 1_MEAN | 0.01 | 0 | 0.00 |
| AMT_CREDIT_SUM_DEBT_SUM | 0.01 | 0 | 0.00 |
| UNSEC_LOAN_COUNT_SUM | 0.01 | 0 | 0.00 |
| CREDIT_DAY_OVERDUE_MEAN | 0.01 | 0 | 0.00 |
| AMT_CREDIT_SUM_OVERDUE_SUM | 0.01 | 0 | 0.00 |
| STATUS_4_MEAN_MEAN | 0.02 | 171260 | 65.00 |
| CREDIT_ACTIVE_Sold_MEAN | 0.02 | 0 | 0.00 |
| STATUS_3_MEAN_MEAN | 0.02 | 171260 | 65.00 |
| STATUS_2_MEAN_MEAN | 0.02 | 171260 | 65.00 |
| STATUS_5_MEAN_MEAN | 0.02 | 171260 | 65.00 |
| MONTHS_BALANCE_MAX_MEAN | 0.02 | 171260 | 65.00 |
| CREDIT_TYPE_Credit card_MEAN | 0.03 | 0 | 0.00 |
| STATUS_0_MEAN_MEAN | 0.04 | 171260 | 65.00 |
| CREDIT_TYPE_Microloan_MEAN | 0.04 | 0 | 0.00 |
| DAYS_CREDIT_ENDDATE_MEAN | 0.05 | 2249 | 0.85 |
| DAYS_ENDDATE_FACT_MEAN | 0.05 | 33136 | 12.58 |
| STATUS_1_MEAN_MEAN | 0.06 | 171260 | 65.00 |
| DAYS_CREDIT_UPDATE_MEAN | 0.07 | 0 | 0.00 |
| DAYS_CREDIT_MIN | 0.08 | 0 | 0.00 |
| MONTHS_BALANCE_MEAN_MEAN | 0.08 | 171260 | 65.00 |
| CREDIT_ACTIVE_Active_MEAN | 0.08 | 0 | 0.00 |
| MONTHS_BALANCE_MIN_MEAN | 0.09 | 171260 | 65.00 |
| DAYS_CREDIT_MEAN | 0.09 | 0 | 0.00 |
| TARGET | 1.00 | 0 | 0.00 |
# Keep the columns retained by the project helper, then remove the label so
# that `bureau` holds features only.
# NOTE(review): `correlation` / `missing` look like a minimum |correlation with
# TARGET| and a maximum percentage of missing values — confirm in Functions.select_numerical.
bureau = select_numerical(df_bureau, correlation=0.03, missing=70).drop('TARGET', axis=1)
bureau
Data Frame a 57 colonnes. Dont 57 colonnes contiennent des valeurs manquantes.
'>>>> Most correlated with TARGET from df_bureau'
| correlations | valeurs_manquantes | %_total | |
|---|---|---|---|
| MONTHS_BALANCE_SIZE_MEAN | -0.08 | 171260 | 65.00 |
| CREDIT_ACTIVE_Closed_MEAN | -0.08 | 0 | 0.00 |
| MONTHS_BALANCE_VAR_MEAN | -0.06 | 171491 | 65.08 |
| STATUS_12_C__MEAN | -0.06 | 173436 | 65.82 |
| STATUS_9_C__MEAN | -0.06 | 173795 | 65.96 |
| STATUS_6_C__MEAN | -0.06 | 174245 | 66.13 |
| STATUS_C_MEAN_MEAN | -0.06 | 171260 | 65.00 |
| STATUS_3_C__MEAN | -0.06 | 174791 | 66.34 |
| STATUS_0_MEAN_MEAN | 0.04 | 171260 | 65.00 |
| CREDIT_TYPE_Microloan_MEAN | 0.04 | 0 | 0.00 |
| DAYS_CREDIT_ENDDATE_MEAN | 0.05 | 2249 | 0.85 |
| DAYS_ENDDATE_FACT_MEAN | 0.05 | 33136 | 12.58 |
| STATUS_1_MEAN_MEAN | 0.06 | 171260 | 65.00 |
| DAYS_CREDIT_UPDATE_MEAN | 0.07 | 0 | 0.00 |
| DAYS_CREDIT_MIN | 0.08 | 0 | 0.00 |
| MONTHS_BALANCE_MEAN_MEAN | 0.08 | 171260 | 65.00 |
| CREDIT_ACTIVE_Active_MEAN | 0.08 | 0 | 0.00 |
| MONTHS_BALANCE_MIN_MEAN | 0.09 | 171260 | 65.00 |
| DAYS_CREDIT_MEAN | 0.09 | 0 | 0.00 |
| TARGET | 1.00 | 0 | 0.00 |
| MONTHS_BALANCE_SIZE_MEAN | CREDIT_ACTIVE_Closed_MEAN | MONTHS_BALANCE_VAR_MEAN | STATUS_12_C__MEAN | STATUS_9_C__MEAN | STATUS_6_C__MEAN | STATUS_C_MEAN_MEAN | STATUS_3_C__MEAN | STATUS_0_MEAN_MEAN | CREDIT_TYPE_Microloan_MEAN | DAYS_CREDIT_ENDDATE_MEAN | DAYS_ENDDATE_FACT_MEAN | STATUS_1_MEAN_MEAN | DAYS_CREDIT_UPDATE_MEAN | DAYS_CREDIT_MIN | MONTHS_BALANCE_MEAN_MEAN | CREDIT_ACTIVE_Active_MEAN | MONTHS_BALANCE_MIN_MEAN | DAYS_CREDIT_MEAN | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| SK_ID_CURR | |||||||||||||||||||
| 100002 | 13.750000 | 0.750000 | 19.854167 | 0.500000 | 0.500 | 0.500000 | 0.175426 | 0.500000 | 0.406960 | 0.0 | -349.000000 | -697.500000 | 0.255682 | -499.875000 | -1437 | -21.875000 | 0.250000 | -28.250000 | -874.000000 |
| 100003 | NaN | 0.750000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 | -544.500000 | -1097.333333 | NaN | -816.000000 | -2586 | NaN | 0.250000 | NaN | -1400.750000 |
| 100004 | NaN | 1.000000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 | -488.500000 | -532.500000 | NaN | -532.000000 | -1326 | NaN | 0.000000 | NaN | -867.000000 |
| 100007 | NaN | 1.000000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 | -783.000000 | -783.000000 | NaN | -783.000000 | -1149 | NaN | 0.000000 | NaN | -1149.000000 |
| 100008 | NaN | 0.666667 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 | -391.333333 | -909.000000 | NaN | -611.000000 | -1097 | NaN | 0.333333 | NaN | -757.333333 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 456247 | 29.090909 | 0.727273 | 110.212121 | 0.593434 | 0.600 | 0.619048 | 0.505634 | 0.666667 | 0.325528 | 0.0 | 1449.818182 | -1085.000000 | 0.000000 | -768.818182 | -2482 | -19.863636 | 0.272727 | -33.909091 | -1043.181818 |
| 456249 | NaN | 0.846154 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 | -1232.333333 | -1364.750000 | NaN | -1064.538462 | -2713 | NaN | 0.153846 | NaN | -1667.076923 |
| 456253 | 29.250000 | 0.500000 | 74.500000 | 0.750000 | 0.750 | 0.750000 | 0.459677 | 0.750000 | 0.404906 | 0.0 | 280.500000 | -794.000000 | 0.000000 | -253.250000 | -919 | -14.125000 | 0.500000 | -28.250000 | -867.500000 |
| 456254 | 37.000000 | 1.000000 | 117.166667 | 1.000000 | 1.000 | 1.000000 | 0.783784 | 1.000000 | 0.216216 | 0.0 | -859.000000 | -859.000000 | 0.000000 | -401.000000 | -1104 | -18.000000 | 0.000000 | -36.000000 | -1104.000000 |
| 456255 | 26.636364 | 0.545455 | 68.333333 | 0.713942 | 0.775 | 0.839286 | 0.645601 | 0.937500 | 0.283249 | 0.0 | 3231.272727 | -968.333333 | 0.000000 | -531.090909 | -2337 | -22.272727 | 0.454545 | -35.090909 | -1089.454545 |
263491 rows × 19 columns
# Project helper from Functions, called with the selected bureau features, the
# TARGET column and a title string. NOTE(review): presumably draws each feature
# against TARGET — its behaviour is not visible here, confirm in Functions.
frame_vs_target(bureau,app_train.TARGET,'Variables aggregated on SK_ID_CURR from bureau.csv and bureau_balance.csv ')
# Only `bureau` is kept from this section: delete the intermediate frames and
# trigger garbage collection to release their memory.
import gc
del bureau_data
del bureau_balance_data
del bureau_agg
del df_bureau
gc.collect()
47470
# Load previous_application.csv and display its (rows, columns) dimensions.
prev_data = pd.read_csv('../donnees/previous_application.csv')
prev_data.shape
(1670214, 37)
prev_data.head()
| SK_ID_PREV | SK_ID_CURR | NAME_CONTRACT_TYPE | AMT_ANNUITY | AMT_APPLICATION | AMT_CREDIT | AMT_DOWN_PAYMENT | AMT_GOODS_PRICE | WEEKDAY_APPR_PROCESS_START | HOUR_APPR_PROCESS_START | FLAG_LAST_APPL_PER_CONTRACT | NFLAG_LAST_APPL_IN_DAY | RATE_DOWN_PAYMENT | RATE_INTEREST_PRIMARY | RATE_INTEREST_PRIVILEGED | NAME_CASH_LOAN_PURPOSE | NAME_CONTRACT_STATUS | DAYS_DECISION | NAME_PAYMENT_TYPE | CODE_REJECT_REASON | NAME_TYPE_SUITE | NAME_CLIENT_TYPE | NAME_GOODS_CATEGORY | NAME_PORTFOLIO | NAME_PRODUCT_TYPE | CHANNEL_TYPE | SELLERPLACE_AREA | NAME_SELLER_INDUSTRY | CNT_PAYMENT | NAME_YIELD_GROUP | PRODUCT_COMBINATION | DAYS_FIRST_DRAWING | DAYS_FIRST_DUE | DAYS_LAST_DUE_1ST_VERSION | DAYS_LAST_DUE | DAYS_TERMINATION | NFLAG_INSURED_ON_APPROVAL | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2030495 | 271877 | Consumer loans | 1730.430 | 17145.0 | 17145.0 | 0.0 | 17145.0 | SATURDAY | 15 | Y | 1 | 0.0 | 0.182832 | 0.867336 | XAP | Approved | -73 | Cash through the bank | XAP | NaN | Repeater | Mobile | POS | XNA | Country-wide | 35 | Connectivity | 12.0 | middle | POS mobile with interest | 365243.0 | -42.0 | 300.0 | -42.0 | -37.0 | 0.0 |
| 1 | 2802425 | 108129 | Cash loans | 25188.615 | 607500.0 | 679671.0 | NaN | 607500.0 | THURSDAY | 11 | Y | 1 | NaN | NaN | NaN | XNA | Approved | -164 | XNA | XAP | Unaccompanied | Repeater | XNA | Cash | x-sell | Contact center | -1 | XNA | 36.0 | low_action | Cash X-Sell: low | 365243.0 | -134.0 | 916.0 | 365243.0 | 365243.0 | 1.0 |
| 2 | 2523466 | 122040 | Cash loans | 15060.735 | 112500.0 | 136444.5 | NaN | 112500.0 | TUESDAY | 11 | Y | 1 | NaN | NaN | NaN | XNA | Approved | -301 | Cash through the bank | XAP | Spouse, partner | Repeater | XNA | Cash | x-sell | Credit and cash offices | -1 | XNA | 12.0 | high | Cash X-Sell: high | 365243.0 | -271.0 | 59.0 | 365243.0 | 365243.0 | 1.0 |
| 3 | 2819243 | 176158 | Cash loans | 47041.335 | 450000.0 | 470790.0 | NaN | 450000.0 | MONDAY | 7 | Y | 1 | NaN | NaN | NaN | XNA | Approved | -512 | Cash through the bank | XAP | NaN | Repeater | XNA | Cash | x-sell | Credit and cash offices | -1 | XNA | 12.0 | middle | Cash X-Sell: middle | 365243.0 | -482.0 | -152.0 | -182.0 | -177.0 | 1.0 |
| 4 | 1784265 | 202054 | Cash loans | 31924.395 | 337500.0 | 404055.0 | NaN | 337500.0 | THURSDAY | 9 | Y | 1 | NaN | NaN | NaN | Repairs | Refused | -781 | Cash through the bank | HC | NaN | Repeater | XNA | Cash | walk-in | Credit and cash offices | -1 | XNA | 24.0 | high | Cash Street: high | NaN | NaN | NaN | NaN | NaN | NaN |
# Number of rows in which RATE_INTEREST_PRIMARY is actually filled in.
prev_data['RATE_INTEREST_PRIMARY'].notnull().sum()
5951
# Frequency of each value of NAME_CONTRACT_STATUS.
prev_data['NAME_CONTRACT_STATUS'].value_counts()
Approved 1036781 Canceled 316319 Refused 290678 Unused offer 26436 Name: NAME_CONTRACT_STATUS, dtype: int64
# Frequency of each value of CODE_REJECT_REASON.
prev_data['CODE_REJECT_REASON'].value_counts()
XAP 1353093 HC 175231 LIMIT 55680 SCO 37467 CLIENT 26436 SCOFR 12811 XNA 5244 VERIF 3535 SYSTEM 717 Name: CODE_REJECT_REASON, dtype: int64
# Engineered columns on the previous applications.

# Annuity times number of payments, minus the credit amount.
prev_data['EXTRA_AMT_PAID'] = (
    prev_data['AMT_ANNUITY'] * prev_data['CNT_PAYMENT'] - prev_data['AMT_CREDIT']
)

# Rows whose reject code is 'XAP' are treated as accepted; every other row
# (including a missing code) is treated as rejected.
xap_mask = prev_data['CODE_REJECT_REASON'] == 'XAP'

# Credit amount split into an "accepted" and a "rejected" column, with 0 on
# the rows belonging to the other group.
prev_data['Amount_credit_accepted'] = prev_data['AMT_CREDIT'].where(xap_mask, 0)
prev_data['Amount_credit_rejected'] = prev_data['AMT_CREDIT'].mask(xap_mask, 0)

# 1 when the reject code is 'XAP', 0 otherwise.
prev_data['PREV_APP_XAP'] = xap_mask.astype(int)

# Project helper from Functions: returns the encoded frame and the list of
# categorical columns; the third return value is discarded.
prev_data, prev_data_cat_columns, _ = one_hot_encoding_dataframe(prev_data)
# Remove the outlier 365243 from every DAYS* column by turning it into NaN
# (the value shows up in the DAYS_* columns of the preview above).
#
# The replacement is written back through a column-level assignment instead of
# `prev_data[col].replace(..., inplace=True)`: that form mutates a temporary
# Series, which raises a chained-assignment warning on recent pandas and does
# nothing at all once Copy-on-Write is enabled.
days_columns = [col for col in prev_data.columns if col.startswith('DAYS')]
if days_columns:  # nothing to assign when no column matches
    prev_data[days_columns] = prev_data[days_columns].replace(365243, np.nan)
# Aggregation plan: the mean of every column except the grouping key.
# NOTE(review): this also averages SK_ID_PREV, which is an identifier — check
# whether SK_ID_PREV_MEAN should be excluded from the features.
prev_data_agg = {
    name: ['mean']
    for name in prev_data.columns
    if name != 'SK_ID_CURR'
}

# One row per client (SK_ID_CURR).
prev_agg = prev_data.groupby('SK_ID_CURR').agg(prev_data_agg)
prev_agg.head()
| SK_ID_PREV | AMT_ANNUITY | AMT_APPLICATION | AMT_CREDIT | AMT_DOWN_PAYMENT | AMT_GOODS_PRICE | HOUR_APPR_PROCESS_START | NFLAG_LAST_APPL_IN_DAY | RATE_DOWN_PAYMENT | RATE_INTEREST_PRIMARY | RATE_INTEREST_PRIVILEGED | DAYS_DECISION | SELLERPLACE_AREA | CNT_PAYMENT | DAYS_FIRST_DRAWING | DAYS_FIRST_DUE | DAYS_LAST_DUE_1ST_VERSION | DAYS_LAST_DUE | DAYS_TERMINATION | NFLAG_INSURED_ON_APPROVAL | EXTRA_AMT_PAID | Amount_credit_accepted | Amount_credit_rejected | PREV_APP_XAP | NAME_CONTRACT_TYPE_Cash loans | NAME_CONTRACT_TYPE_Consumer loans | NAME_CONTRACT_TYPE_Revolving loans | NAME_CONTRACT_TYPE_XNA | WEEKDAY_APPR_PROCESS_START_FRIDAY | WEEKDAY_APPR_PROCESS_START_MONDAY | WEEKDAY_APPR_PROCESS_START_SATURDAY | WEEKDAY_APPR_PROCESS_START_SUNDAY | WEEKDAY_APPR_PROCESS_START_THURSDAY | WEEKDAY_APPR_PROCESS_START_TUESDAY | WEEKDAY_APPR_PROCESS_START_WEDNESDAY | FLAG_LAST_APPL_PER_CONTRACT_N | FLAG_LAST_APPL_PER_CONTRACT_Y | NAME_CASH_LOAN_PURPOSE_Building a house or an annex | NAME_CASH_LOAN_PURPOSE_Business development | NAME_CASH_LOAN_PURPOSE_Buying a garage | NAME_CASH_LOAN_PURPOSE_Buying a holiday home / land | NAME_CASH_LOAN_PURPOSE_Buying a home | NAME_CASH_LOAN_PURPOSE_Buying a new car | NAME_CASH_LOAN_PURPOSE_Buying a used car | NAME_CASH_LOAN_PURPOSE_Car repairs | NAME_CASH_LOAN_PURPOSE_Education | NAME_CASH_LOAN_PURPOSE_Everyday expenses | NAME_CASH_LOAN_PURPOSE_Furniture | NAME_CASH_LOAN_PURPOSE_Gasification / water supply | NAME_CASH_LOAN_PURPOSE_Hobby | NAME_CASH_LOAN_PURPOSE_Journey | NAME_CASH_LOAN_PURPOSE_Medicine | NAME_CASH_LOAN_PURPOSE_Money for a third person | NAME_CASH_LOAN_PURPOSE_Other | NAME_CASH_LOAN_PURPOSE_Payments on other loans | NAME_CASH_LOAN_PURPOSE_Purchase of electronic equipment | NAME_CASH_LOAN_PURPOSE_Refusal to name the goal | NAME_CASH_LOAN_PURPOSE_Repairs | NAME_CASH_LOAN_PURPOSE_Urgent needs | NAME_CASH_LOAN_PURPOSE_Wedding / gift / holiday | NAME_CASH_LOAN_PURPOSE_XAP | NAME_CASH_LOAN_PURPOSE_XNA | NAME_CONTRACT_STATUS_Approved 
| NAME_CONTRACT_STATUS_Canceled | NAME_CONTRACT_STATUS_Refused | NAME_CONTRACT_STATUS_Unused offer | NAME_PAYMENT_TYPE_Cash through the bank | NAME_PAYMENT_TYPE_Cashless from the account of the employer | NAME_PAYMENT_TYPE_Non-cash from your account | NAME_PAYMENT_TYPE_XNA | CODE_REJECT_REASON_CLIENT | CODE_REJECT_REASON_HC | CODE_REJECT_REASON_LIMIT | CODE_REJECT_REASON_SCO | CODE_REJECT_REASON_SCOFR | CODE_REJECT_REASON_SYSTEM | CODE_REJECT_REASON_VERIF | CODE_REJECT_REASON_XAP | CODE_REJECT_REASON_XNA | NAME_TYPE_SUITE_Children | NAME_TYPE_SUITE_Family | NAME_TYPE_SUITE_Group of people | NAME_TYPE_SUITE_Other_A | NAME_TYPE_SUITE_Other_B | NAME_TYPE_SUITE_Spouse, partner | NAME_TYPE_SUITE_Unaccompanied | NAME_CLIENT_TYPE_New | NAME_CLIENT_TYPE_Refreshed | NAME_CLIENT_TYPE_Repeater | NAME_CLIENT_TYPE_XNA | NAME_GOODS_CATEGORY_Additional Service | NAME_GOODS_CATEGORY_Animals | NAME_GOODS_CATEGORY_Audio/Video | NAME_GOODS_CATEGORY_Auto Accessories | NAME_GOODS_CATEGORY_Clothing and Accessories | NAME_GOODS_CATEGORY_Computers | NAME_GOODS_CATEGORY_Construction Materials | NAME_GOODS_CATEGORY_Consumer Electronics | NAME_GOODS_CATEGORY_Direct Sales | NAME_GOODS_CATEGORY_Education | NAME_GOODS_CATEGORY_Fitness | NAME_GOODS_CATEGORY_Furniture | NAME_GOODS_CATEGORY_Gardening | NAME_GOODS_CATEGORY_Homewares | NAME_GOODS_CATEGORY_House Construction | NAME_GOODS_CATEGORY_Insurance | NAME_GOODS_CATEGORY_Jewelry | NAME_GOODS_CATEGORY_Medical Supplies | NAME_GOODS_CATEGORY_Medicine | NAME_GOODS_CATEGORY_Mobile | NAME_GOODS_CATEGORY_Office Appliances | NAME_GOODS_CATEGORY_Other | NAME_GOODS_CATEGORY_Photo / Cinema Equipment | NAME_GOODS_CATEGORY_Sport and Leisure | NAME_GOODS_CATEGORY_Tourism | NAME_GOODS_CATEGORY_Vehicles | NAME_GOODS_CATEGORY_Weapon | NAME_GOODS_CATEGORY_XNA | NAME_PORTFOLIO_Cards | NAME_PORTFOLIO_Cars | NAME_PORTFOLIO_Cash | NAME_PORTFOLIO_POS | NAME_PORTFOLIO_XNA | NAME_PRODUCT_TYPE_XNA | NAME_PRODUCT_TYPE_walk-in | NAME_PRODUCT_TYPE_x-sell | 
CHANNEL_TYPE_AP+ (Cash loan) | CHANNEL_TYPE_Car dealer | CHANNEL_TYPE_Channel of corporate sales | CHANNEL_TYPE_Contact center | CHANNEL_TYPE_Country-wide | CHANNEL_TYPE_Credit and cash offices | CHANNEL_TYPE_Regional / Local | CHANNEL_TYPE_Stone | NAME_SELLER_INDUSTRY_Auto technology | NAME_SELLER_INDUSTRY_Clothing | NAME_SELLER_INDUSTRY_Connectivity | NAME_SELLER_INDUSTRY_Construction | NAME_SELLER_INDUSTRY_Consumer electronics | NAME_SELLER_INDUSTRY_Furniture | NAME_SELLER_INDUSTRY_Industry | NAME_SELLER_INDUSTRY_Jewelry | NAME_SELLER_INDUSTRY_MLM partners | NAME_SELLER_INDUSTRY_Tourism | NAME_SELLER_INDUSTRY_XNA | NAME_YIELD_GROUP_XNA | NAME_YIELD_GROUP_high | NAME_YIELD_GROUP_low_action | NAME_YIELD_GROUP_low_normal | NAME_YIELD_GROUP_middle | PRODUCT_COMBINATION_Card Street | PRODUCT_COMBINATION_Card X-Sell | PRODUCT_COMBINATION_Cash | PRODUCT_COMBINATION_Cash Street: high | PRODUCT_COMBINATION_Cash Street: low | PRODUCT_COMBINATION_Cash Street: middle | PRODUCT_COMBINATION_Cash X-Sell: high | PRODUCT_COMBINATION_Cash X-Sell: low | PRODUCT_COMBINATION_Cash X-Sell: middle | PRODUCT_COMBINATION_POS household with interest | PRODUCT_COMBINATION_POS household without interest | PRODUCT_COMBINATION_POS industry with interest | PRODUCT_COMBINATION_POS industry without interest | PRODUCT_COMBINATION_POS mobile with interest | PRODUCT_COMBINATION_POS mobile without interest | PRODUCT_COMBINATION_POS other with interest | PRODUCT_COMBINATION_POS others without interest | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | |
| SK_ID_CURR | |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 100001 | 1.369693e+06 | 3951.000 | 24835.50 | 23787.00 | 2520.0 | 24835.5 | 13.000000 | 1.0 | 0.104326 | NaN | NaN | -1740.0 | 23.0 | 8.0 | NaN | -1709.000000 | -1499.000000 | -1619.000000 | -1612.000000 | 0.000000 | 7821.00 | 23787.00 | 0.0 | 1.0 | 0.000000 | 1.000000 | 0.0 | 0.0 | 1.000000 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.000000 | 0.000000 | 1.0 | 0.0 | 0.0 | 0.0 | 1.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 1.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.000000 | 1.000000 | 0.0 | 1.000000 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 1.000000 | 0.000000 | 0.0 | 0.000000 | 0.0 | 0.0 | 1.0 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 1.0 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.000000 | 0.0 | 0.000000 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
| 100002 | 1.038818e+06 | 9251.775 | 179055.00 | 179055.00 | 0.0 | 179055.0 | 9.000000 | 1.0 | 0.000000 | NaN | NaN | -606.0 | 500.0 | 24.0 | NaN | -565.000000 | 125.000000 | -25.000000 | -17.000000 | 0.000000 | 42987.60 | 179055.00 | 0.0 | 1.0 | 0.000000 | 1.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 1.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.000000 | 0.000000 | 1.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 1.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 1.0 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.000000 | 1.000000 | 0.0 | 1.000000 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.0 | 1.000000 | 1.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 1.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.000000 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 100003 | 2.281150e+06 | 56553.990 | 435436.50 | 484191.00 | 3442.5 | 435436.5 | 14.666667 | 1.0 | 0.050030 | NaN | NaN | -1305.0 | 533.0 | 10.0 | NaN | -1274.333333 | -1004.333333 | -1054.333333 | -1047.333333 | 0.666667 | 65321.55 | 484191.00 | 0.0 | 1.0 | 0.333333 | 0.666667 | 0.0 | 0.0 | 0.333333 | 0.0 | 0.333333 | 0.333333 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.666667 | 0.333333 | 1.0 | 0.0 | 0.0 | 0.0 | 0.666667 | 0.0 | 0.0 | 0.333333 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.666667 | 0.0 | 0.0 | 0.0 | 0.0 | 0.333333 | 0.0 | 0.666667 | 0.333333 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.333333 | 0.0 | 0.0 | 0.0 | 0.333333 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.333333 | 0.0 | 0.0 | 0.333333 | 0.666667 | 0.0 | 0.666667 | 0.0 | 0.333333 | 0.0 | 0.0 | 0.0 | 0.0 | 0.333333 | 0.333333 | 0.0 | 0.333333 | 0.0 | 0.0 | 0.0 | 0.0 | 0.333333 | 0.333333 | 0.0 | 0.0 | 0.0 | 0.0 | 0.333333 | 0.0 | 0.0 | 0.0 | 0.333333 | 0.666667 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.333333 | 0.0 | 0.333333 | 0.0 | 0.333333 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 100004 | 1.564014e+06 | 5357.250 | 24282.00 | 20106.00 | 4860.0 | 24282.0 | 5.000000 | 1.0 | 0.212008 | NaN | NaN | -815.0 | 30.0 | 4.0 | NaN | -784.000000 | -694.000000 | -724.000000 | -714.000000 | 0.000000 | 1323.00 | 20106.00 | 0.0 | 1.0 | 0.000000 | 1.000000 | 0.0 | 0.0 | 1.000000 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.000000 | 0.000000 | 1.0 | 0.0 | 0.0 | 0.0 | 1.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 1.000000 | 1.0 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.000000 | 1.000000 | 0.0 | 1.000000 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.000000 | 1.0 | 0.000000 | 0.0 | 0.0 | 1.0 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.000000 | 1.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.000000 | 0.0 | 0.000000 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 100005 | 2.176837e+06 | 4813.200 | 22308.75 | 20076.75 | 4464.0 | 44617.5 | 10.500000 | 1.0 | 0.108964 | NaN | NaN | -536.0 | 18.0 | 12.0 | NaN | -706.000000 | -376.000000 | -466.000000 | -460.000000 | 0.000000 | 17604.90 | 20076.75 | 0.0 | 1.0 | 0.500000 | 0.500000 | 0.0 | 0.0 | 0.500000 | 0.0 | 0.000000 | 0.000000 | 0.5 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.500000 | 0.500000 | 0.5 | 0.5 | 0.0 | 0.0 | 0.500000 | 0.0 | 0.0 | 0.500000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.5 | 0.000000 | 0.500000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.5 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.500000 | 0.0 | 0.0 | 0.000000 | 0.500000 | 0.5 | 1.000000 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.500000 | 0.500000 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.5 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.500000 | 0.5 | 0.5 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.5 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.000000 | 0.0 | 0.000000 | 0.0 | 0.5 | 0.0 | 0.0 | 0.0 |
# Flatten the two-level (column, statistic) header produced by the aggregation
# into single names of the form PREV_<column>_<STATISTIC>.
# The guard makes the cell safe to re-run: once the columns are plain strings,
# c[0] and c[1] would index their first two characters and every column would
# be renamed to "PREV_P_R".
if isinstance(prev_agg.columns, pd.MultiIndex):
    modified_col = [
        "PREV_" + name + "_" + stat.upper()
        for name, stat in prev_agg.columns
    ]
    prev_agg.columns = modified_col
prev_agg.head()
| PREV_SK_ID_PREV_MEAN | PREV_AMT_ANNUITY_MEAN | PREV_AMT_APPLICATION_MEAN | PREV_AMT_CREDIT_MEAN | PREV_AMT_DOWN_PAYMENT_MEAN | PREV_AMT_GOODS_PRICE_MEAN | PREV_HOUR_APPR_PROCESS_START_MEAN | PREV_NFLAG_LAST_APPL_IN_DAY_MEAN | PREV_RATE_DOWN_PAYMENT_MEAN | PREV_RATE_INTEREST_PRIMARY_MEAN | PREV_RATE_INTEREST_PRIVILEGED_MEAN | PREV_DAYS_DECISION_MEAN | PREV_SELLERPLACE_AREA_MEAN | PREV_CNT_PAYMENT_MEAN | PREV_DAYS_FIRST_DRAWING_MEAN | PREV_DAYS_FIRST_DUE_MEAN | PREV_DAYS_LAST_DUE_1ST_VERSION_MEAN | PREV_DAYS_LAST_DUE_MEAN | PREV_DAYS_TERMINATION_MEAN | PREV_NFLAG_INSURED_ON_APPROVAL_MEAN | PREV_EXTRA_AMT_PAID_MEAN | PREV_Amount_credit_accepted_MEAN | PREV_Amount_credit_rejected_MEAN | PREV_PREV_APP_XAP_MEAN | PREV_NAME_CONTRACT_TYPE_Cash loans_MEAN | PREV_NAME_CONTRACT_TYPE_Consumer loans_MEAN | PREV_NAME_CONTRACT_TYPE_Revolving loans_MEAN | PREV_NAME_CONTRACT_TYPE_XNA_MEAN | PREV_WEEKDAY_APPR_PROCESS_START_FRIDAY_MEAN | PREV_WEEKDAY_APPR_PROCESS_START_MONDAY_MEAN | PREV_WEEKDAY_APPR_PROCESS_START_SATURDAY_MEAN | PREV_WEEKDAY_APPR_PROCESS_START_SUNDAY_MEAN | PREV_WEEKDAY_APPR_PROCESS_START_THURSDAY_MEAN | PREV_WEEKDAY_APPR_PROCESS_START_TUESDAY_MEAN | PREV_WEEKDAY_APPR_PROCESS_START_WEDNESDAY_MEAN | PREV_FLAG_LAST_APPL_PER_CONTRACT_N_MEAN | PREV_FLAG_LAST_APPL_PER_CONTRACT_Y_MEAN | PREV_NAME_CASH_LOAN_PURPOSE_Building a house or an annex_MEAN | PREV_NAME_CASH_LOAN_PURPOSE_Business development_MEAN | PREV_NAME_CASH_LOAN_PURPOSE_Buying a garage_MEAN | PREV_NAME_CASH_LOAN_PURPOSE_Buying a holiday home / land_MEAN | PREV_NAME_CASH_LOAN_PURPOSE_Buying a home_MEAN | PREV_NAME_CASH_LOAN_PURPOSE_Buying a new car_MEAN | PREV_NAME_CASH_LOAN_PURPOSE_Buying a used car_MEAN | PREV_NAME_CASH_LOAN_PURPOSE_Car repairs_MEAN | PREV_NAME_CASH_LOAN_PURPOSE_Education_MEAN | PREV_NAME_CASH_LOAN_PURPOSE_Everyday expenses_MEAN | PREV_NAME_CASH_LOAN_PURPOSE_Furniture_MEAN | PREV_NAME_CASH_LOAN_PURPOSE_Gasification / water supply_MEAN | PREV_NAME_CASH_LOAN_PURPOSE_Hobby_MEAN | 
PREV_NAME_CASH_LOAN_PURPOSE_Journey_MEAN | PREV_NAME_CASH_LOAN_PURPOSE_Medicine_MEAN | PREV_NAME_CASH_LOAN_PURPOSE_Money for a third person_MEAN | PREV_NAME_CASH_LOAN_PURPOSE_Other_MEAN | PREV_NAME_CASH_LOAN_PURPOSE_Payments on other loans_MEAN | PREV_NAME_CASH_LOAN_PURPOSE_Purchase of electronic equipment_MEAN | PREV_NAME_CASH_LOAN_PURPOSE_Refusal to name the goal_MEAN | PREV_NAME_CASH_LOAN_PURPOSE_Repairs_MEAN | PREV_NAME_CASH_LOAN_PURPOSE_Urgent needs_MEAN | PREV_NAME_CASH_LOAN_PURPOSE_Wedding / gift / holiday_MEAN | PREV_NAME_CASH_LOAN_PURPOSE_XAP_MEAN | PREV_NAME_CASH_LOAN_PURPOSE_XNA_MEAN | PREV_NAME_CONTRACT_STATUS_Approved_MEAN | PREV_NAME_CONTRACT_STATUS_Canceled_MEAN | PREV_NAME_CONTRACT_STATUS_Refused_MEAN | PREV_NAME_CONTRACT_STATUS_Unused offer_MEAN | PREV_NAME_PAYMENT_TYPE_Cash through the bank_MEAN | PREV_NAME_PAYMENT_TYPE_Cashless from the account of the employer_MEAN | PREV_NAME_PAYMENT_TYPE_Non-cash from your account_MEAN | PREV_NAME_PAYMENT_TYPE_XNA_MEAN | PREV_CODE_REJECT_REASON_CLIENT_MEAN | PREV_CODE_REJECT_REASON_HC_MEAN | PREV_CODE_REJECT_REASON_LIMIT_MEAN | PREV_CODE_REJECT_REASON_SCO_MEAN | PREV_CODE_REJECT_REASON_SCOFR_MEAN | PREV_CODE_REJECT_REASON_SYSTEM_MEAN | PREV_CODE_REJECT_REASON_VERIF_MEAN | PREV_CODE_REJECT_REASON_XAP_MEAN | PREV_CODE_REJECT_REASON_XNA_MEAN | PREV_NAME_TYPE_SUITE_Children_MEAN | PREV_NAME_TYPE_SUITE_Family_MEAN | PREV_NAME_TYPE_SUITE_Group of people_MEAN | PREV_NAME_TYPE_SUITE_Other_A_MEAN | PREV_NAME_TYPE_SUITE_Other_B_MEAN | PREV_NAME_TYPE_SUITE_Spouse, partner_MEAN | PREV_NAME_TYPE_SUITE_Unaccompanied_MEAN | PREV_NAME_CLIENT_TYPE_New_MEAN | PREV_NAME_CLIENT_TYPE_Refreshed_MEAN | PREV_NAME_CLIENT_TYPE_Repeater_MEAN | PREV_NAME_CLIENT_TYPE_XNA_MEAN | PREV_NAME_GOODS_CATEGORY_Additional Service_MEAN | PREV_NAME_GOODS_CATEGORY_Animals_MEAN | PREV_NAME_GOODS_CATEGORY_Audio/Video_MEAN | PREV_NAME_GOODS_CATEGORY_Auto Accessories_MEAN | PREV_NAME_GOODS_CATEGORY_Clothing and Accessories_MEAN | 
PREV_NAME_GOODS_CATEGORY_Computers_MEAN | PREV_NAME_GOODS_CATEGORY_Construction Materials_MEAN | PREV_NAME_GOODS_CATEGORY_Consumer Electronics_MEAN | PREV_NAME_GOODS_CATEGORY_Direct Sales_MEAN | PREV_NAME_GOODS_CATEGORY_Education_MEAN | PREV_NAME_GOODS_CATEGORY_Fitness_MEAN | PREV_NAME_GOODS_CATEGORY_Furniture_MEAN | PREV_NAME_GOODS_CATEGORY_Gardening_MEAN | PREV_NAME_GOODS_CATEGORY_Homewares_MEAN | PREV_NAME_GOODS_CATEGORY_House Construction_MEAN | PREV_NAME_GOODS_CATEGORY_Insurance_MEAN | PREV_NAME_GOODS_CATEGORY_Jewelry_MEAN | PREV_NAME_GOODS_CATEGORY_Medical Supplies_MEAN | PREV_NAME_GOODS_CATEGORY_Medicine_MEAN | PREV_NAME_GOODS_CATEGORY_Mobile_MEAN | PREV_NAME_GOODS_CATEGORY_Office Appliances_MEAN | PREV_NAME_GOODS_CATEGORY_Other_MEAN | PREV_NAME_GOODS_CATEGORY_Photo / Cinema Equipment_MEAN | PREV_NAME_GOODS_CATEGORY_Sport and Leisure_MEAN | PREV_NAME_GOODS_CATEGORY_Tourism_MEAN | PREV_NAME_GOODS_CATEGORY_Vehicles_MEAN | PREV_NAME_GOODS_CATEGORY_Weapon_MEAN | PREV_NAME_GOODS_CATEGORY_XNA_MEAN | PREV_NAME_PORTFOLIO_Cards_MEAN | PREV_NAME_PORTFOLIO_Cars_MEAN | PREV_NAME_PORTFOLIO_Cash_MEAN | PREV_NAME_PORTFOLIO_POS_MEAN | PREV_NAME_PORTFOLIO_XNA_MEAN | PREV_NAME_PRODUCT_TYPE_XNA_MEAN | PREV_NAME_PRODUCT_TYPE_walk-in_MEAN | PREV_NAME_PRODUCT_TYPE_x-sell_MEAN | PREV_CHANNEL_TYPE_AP+ (Cash loan)_MEAN | PREV_CHANNEL_TYPE_Car dealer_MEAN | PREV_CHANNEL_TYPE_Channel of corporate sales_MEAN | PREV_CHANNEL_TYPE_Contact center_MEAN | PREV_CHANNEL_TYPE_Country-wide_MEAN | PREV_CHANNEL_TYPE_Credit and cash offices_MEAN | PREV_CHANNEL_TYPE_Regional / Local_MEAN | PREV_CHANNEL_TYPE_Stone_MEAN | PREV_NAME_SELLER_INDUSTRY_Auto technology_MEAN | PREV_NAME_SELLER_INDUSTRY_Clothing_MEAN | PREV_NAME_SELLER_INDUSTRY_Connectivity_MEAN | PREV_NAME_SELLER_INDUSTRY_Construction_MEAN | PREV_NAME_SELLER_INDUSTRY_Consumer electronics_MEAN | PREV_NAME_SELLER_INDUSTRY_Furniture_MEAN | PREV_NAME_SELLER_INDUSTRY_Industry_MEAN | PREV_NAME_SELLER_INDUSTRY_Jewelry_MEAN | 
PREV_NAME_SELLER_INDUSTRY_MLM partners_MEAN | PREV_NAME_SELLER_INDUSTRY_Tourism_MEAN | PREV_NAME_SELLER_INDUSTRY_XNA_MEAN | PREV_NAME_YIELD_GROUP_XNA_MEAN | PREV_NAME_YIELD_GROUP_high_MEAN | PREV_NAME_YIELD_GROUP_low_action_MEAN | PREV_NAME_YIELD_GROUP_low_normal_MEAN | PREV_NAME_YIELD_GROUP_middle_MEAN | PREV_PRODUCT_COMBINATION_Card Street_MEAN | PREV_PRODUCT_COMBINATION_Card X-Sell_MEAN | PREV_PRODUCT_COMBINATION_Cash_MEAN | PREV_PRODUCT_COMBINATION_Cash Street: high_MEAN | PREV_PRODUCT_COMBINATION_Cash Street: low_MEAN | PREV_PRODUCT_COMBINATION_Cash Street: middle_MEAN | PREV_PRODUCT_COMBINATION_Cash X-Sell: high_MEAN | PREV_PRODUCT_COMBINATION_Cash X-Sell: low_MEAN | PREV_PRODUCT_COMBINATION_Cash X-Sell: middle_MEAN | PREV_PRODUCT_COMBINATION_POS household with interest_MEAN | PREV_PRODUCT_COMBINATION_POS household without interest_MEAN | PREV_PRODUCT_COMBINATION_POS industry with interest_MEAN | PREV_PRODUCT_COMBINATION_POS industry without interest_MEAN | PREV_PRODUCT_COMBINATION_POS mobile with interest_MEAN | PREV_PRODUCT_COMBINATION_POS mobile without interest_MEAN | PREV_PRODUCT_COMBINATION_POS other with interest_MEAN | PREV_PRODUCT_COMBINATION_POS others without interest_MEAN | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| SK_ID_CURR | |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 100001 | 1.369693e+06 | 3951.000 | 24835.50 | 23787.00 | 2520.0 | 24835.5 | 13.000000 | 1.0 | 0.104326 | NaN | NaN | -1740.0 | 23.0 | 8.0 | NaN | -1709.000000 | -1499.000000 | -1619.000000 | -1612.000000 | 0.000000 | 7821.00 | 23787.00 | 0.0 | 1.0 | 0.000000 | 1.000000 | 0.0 | 0.0 | 1.000000 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.000000 | 0.000000 | 1.0 | 0.0 | 0.0 | 0.0 | 1.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 1.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.000000 | 1.000000 | 0.0 | 1.000000 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 1.000000 | 0.000000 | 0.0 | 0.000000 | 0.0 | 0.0 | 1.0 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 1.0 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.000000 | 0.0 | 0.000000 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
| 100002 | 1.038818e+06 | 9251.775 | 179055.00 | 179055.00 | 0.0 | 179055.0 | 9.000000 | 1.0 | 0.000000 | NaN | NaN | -606.0 | 500.0 | 24.0 | NaN | -565.000000 | 125.000000 | -25.000000 | -17.000000 | 0.000000 | 42987.60 | 179055.00 | 0.0 | 1.0 | 0.000000 | 1.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 1.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.000000 | 0.000000 | 1.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 1.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 1.0 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.000000 | 1.000000 | 0.0 | 1.000000 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.0 | 1.000000 | 1.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 1.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.000000 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 100003 | 2.281150e+06 | 56553.990 | 435436.50 | 484191.00 | 3442.5 | 435436.5 | 14.666667 | 1.0 | 0.050030 | NaN | NaN | -1305.0 | 533.0 | 10.0 | NaN | -1274.333333 | -1004.333333 | -1054.333333 | -1047.333333 | 0.666667 | 65321.55 | 484191.00 | 0.0 | 1.0 | 0.333333 | 0.666667 | 0.0 | 0.0 | 0.333333 | 0.0 | 0.333333 | 0.333333 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.666667 | 0.333333 | 1.0 | 0.0 | 0.0 | 0.0 | 0.666667 | 0.0 | 0.0 | 0.333333 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.666667 | 0.0 | 0.0 | 0.0 | 0.0 | 0.333333 | 0.0 | 0.666667 | 0.333333 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.333333 | 0.0 | 0.0 | 0.0 | 0.333333 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.333333 | 0.0 | 0.0 | 0.333333 | 0.666667 | 0.0 | 0.666667 | 0.0 | 0.333333 | 0.0 | 0.0 | 0.0 | 0.0 | 0.333333 | 0.333333 | 0.0 | 0.333333 | 0.0 | 0.0 | 0.0 | 0.0 | 0.333333 | 0.333333 | 0.0 | 0.0 | 0.0 | 0.0 | 0.333333 | 0.0 | 0.0 | 0.0 | 0.333333 | 0.666667 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.333333 | 0.0 | 0.333333 | 0.0 | 0.333333 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 100004 | 1.564014e+06 | 5357.250 | 24282.00 | 20106.00 | 4860.0 | 24282.0 | 5.000000 | 1.0 | 0.212008 | NaN | NaN | -815.0 | 30.0 | 4.0 | NaN | -784.000000 | -694.000000 | -724.000000 | -714.000000 | 0.000000 | 1323.00 | 20106.00 | 0.0 | 1.0 | 0.000000 | 1.000000 | 0.0 | 0.0 | 1.000000 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.000000 | 0.000000 | 1.0 | 0.0 | 0.0 | 0.0 | 1.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 1.000000 | 1.0 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.000000 | 1.000000 | 0.0 | 1.000000 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.000000 | 1.0 | 0.000000 | 0.0 | 0.0 | 1.0 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.000000 | 1.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.000000 | 0.0 | 0.000000 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 100005 | 2.176837e+06 | 4813.200 | 22308.75 | 20076.75 | 4464.0 | 44617.5 | 10.500000 | 1.0 | 0.108964 | NaN | NaN | -536.0 | 18.0 | 12.0 | NaN | -706.000000 | -376.000000 | -466.000000 | -460.000000 | 0.000000 | 17604.90 | 20076.75 | 0.0 | 1.0 | 0.500000 | 0.500000 | 0.0 | 0.0 | 0.500000 | 0.0 | 0.000000 | 0.000000 | 0.5 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.500000 | 0.500000 | 0.5 | 0.5 | 0.0 | 0.0 | 0.500000 | 0.0 | 0.0 | 0.500000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.5 | 0.000000 | 0.500000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.5 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.500000 | 0.0 | 0.0 | 0.000000 | 0.500000 | 0.5 | 1.000000 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.500000 | 0.500000 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.5 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.500000 | 0.5 | 0.5 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.5 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.000000 | 0.0 | 0.000000 | 0.0 | 0.5 | 0.0 | 0.0 | 0.0 |
# Attach the TARGET label to the aggregated features; the inner join keeps
# only the clients found on both sides.
# NOTE(review): DataFrame.join matches prev_agg's SK_ID_CURR against the index
# of the TARGET series, so this assumes app_train is indexed by SK_ID_CURR -
# confirm where app_train is loaded.
df_prev = prev_agg.join(app_train['TARGET'], on='SK_ID_CURR', how='inner')
# Label the frame through a plain attribute; presumably read by the project's
# summary/plot helpers - verify against Functions.py.
df_prev.name = 'df_prev'
df_prev.head()
| PREV_SK_ID_PREV_MEAN | PREV_AMT_ANNUITY_MEAN | PREV_AMT_APPLICATION_MEAN | PREV_AMT_CREDIT_MEAN | PREV_AMT_DOWN_PAYMENT_MEAN | PREV_AMT_GOODS_PRICE_MEAN | PREV_HOUR_APPR_PROCESS_START_MEAN | PREV_NFLAG_LAST_APPL_IN_DAY_MEAN | PREV_RATE_DOWN_PAYMENT_MEAN | PREV_RATE_INTEREST_PRIMARY_MEAN | PREV_RATE_INTEREST_PRIVILEGED_MEAN | PREV_DAYS_DECISION_MEAN | PREV_SELLERPLACE_AREA_MEAN | PREV_CNT_PAYMENT_MEAN | PREV_DAYS_FIRST_DRAWING_MEAN | PREV_DAYS_FIRST_DUE_MEAN | PREV_DAYS_LAST_DUE_1ST_VERSION_MEAN | PREV_DAYS_LAST_DUE_MEAN | PREV_DAYS_TERMINATION_MEAN | PREV_NFLAG_INSURED_ON_APPROVAL_MEAN | PREV_EXTRA_AMT_PAID_MEAN | PREV_Amount_credit_accepted_MEAN | PREV_Amount_credit_rejected_MEAN | PREV_PREV_APP_XAP_MEAN | PREV_NAME_CONTRACT_TYPE_Cash loans_MEAN | PREV_NAME_CONTRACT_TYPE_Consumer loans_MEAN | PREV_NAME_CONTRACT_TYPE_Revolving loans_MEAN | PREV_NAME_CONTRACT_TYPE_XNA_MEAN | PREV_WEEKDAY_APPR_PROCESS_START_FRIDAY_MEAN | PREV_WEEKDAY_APPR_PROCESS_START_MONDAY_MEAN | PREV_WEEKDAY_APPR_PROCESS_START_SATURDAY_MEAN | PREV_WEEKDAY_APPR_PROCESS_START_SUNDAY_MEAN | PREV_WEEKDAY_APPR_PROCESS_START_THURSDAY_MEAN | PREV_WEEKDAY_APPR_PROCESS_START_TUESDAY_MEAN | PREV_WEEKDAY_APPR_PROCESS_START_WEDNESDAY_MEAN | PREV_FLAG_LAST_APPL_PER_CONTRACT_N_MEAN | PREV_FLAG_LAST_APPL_PER_CONTRACT_Y_MEAN | PREV_NAME_CASH_LOAN_PURPOSE_Building a house or an annex_MEAN | PREV_NAME_CASH_LOAN_PURPOSE_Business development_MEAN | PREV_NAME_CASH_LOAN_PURPOSE_Buying a garage_MEAN | PREV_NAME_CASH_LOAN_PURPOSE_Buying a holiday home / land_MEAN | PREV_NAME_CASH_LOAN_PURPOSE_Buying a home_MEAN | PREV_NAME_CASH_LOAN_PURPOSE_Buying a new car_MEAN | PREV_NAME_CASH_LOAN_PURPOSE_Buying a used car_MEAN | PREV_NAME_CASH_LOAN_PURPOSE_Car repairs_MEAN | PREV_NAME_CASH_LOAN_PURPOSE_Education_MEAN | PREV_NAME_CASH_LOAN_PURPOSE_Everyday expenses_MEAN | PREV_NAME_CASH_LOAN_PURPOSE_Furniture_MEAN | PREV_NAME_CASH_LOAN_PURPOSE_Gasification / water supply_MEAN | PREV_NAME_CASH_LOAN_PURPOSE_Hobby_MEAN | 
PREV_NAME_CASH_LOAN_PURPOSE_Journey_MEAN | PREV_NAME_CASH_LOAN_PURPOSE_Medicine_MEAN | PREV_NAME_CASH_LOAN_PURPOSE_Money for a third person_MEAN | PREV_NAME_CASH_LOAN_PURPOSE_Other_MEAN | PREV_NAME_CASH_LOAN_PURPOSE_Payments on other loans_MEAN | PREV_NAME_CASH_LOAN_PURPOSE_Purchase of electronic equipment_MEAN | PREV_NAME_CASH_LOAN_PURPOSE_Refusal to name the goal_MEAN | PREV_NAME_CASH_LOAN_PURPOSE_Repairs_MEAN | PREV_NAME_CASH_LOAN_PURPOSE_Urgent needs_MEAN | PREV_NAME_CASH_LOAN_PURPOSE_Wedding / gift / holiday_MEAN | PREV_NAME_CASH_LOAN_PURPOSE_XAP_MEAN | PREV_NAME_CASH_LOAN_PURPOSE_XNA_MEAN | PREV_NAME_CONTRACT_STATUS_Approved_MEAN | PREV_NAME_CONTRACT_STATUS_Canceled_MEAN | PREV_NAME_CONTRACT_STATUS_Refused_MEAN | PREV_NAME_CONTRACT_STATUS_Unused offer_MEAN | PREV_NAME_PAYMENT_TYPE_Cash through the bank_MEAN | PREV_NAME_PAYMENT_TYPE_Cashless from the account of the employer_MEAN | PREV_NAME_PAYMENT_TYPE_Non-cash from your account_MEAN | PREV_NAME_PAYMENT_TYPE_XNA_MEAN | PREV_CODE_REJECT_REASON_CLIENT_MEAN | PREV_CODE_REJECT_REASON_HC_MEAN | PREV_CODE_REJECT_REASON_LIMIT_MEAN | PREV_CODE_REJECT_REASON_SCO_MEAN | PREV_CODE_REJECT_REASON_SCOFR_MEAN | PREV_CODE_REJECT_REASON_SYSTEM_MEAN | PREV_CODE_REJECT_REASON_VERIF_MEAN | PREV_CODE_REJECT_REASON_XAP_MEAN | PREV_CODE_REJECT_REASON_XNA_MEAN | PREV_NAME_TYPE_SUITE_Children_MEAN | PREV_NAME_TYPE_SUITE_Family_MEAN | PREV_NAME_TYPE_SUITE_Group of people_MEAN | PREV_NAME_TYPE_SUITE_Other_A_MEAN | PREV_NAME_TYPE_SUITE_Other_B_MEAN | PREV_NAME_TYPE_SUITE_Spouse, partner_MEAN | PREV_NAME_TYPE_SUITE_Unaccompanied_MEAN | PREV_NAME_CLIENT_TYPE_New_MEAN | PREV_NAME_CLIENT_TYPE_Refreshed_MEAN | PREV_NAME_CLIENT_TYPE_Repeater_MEAN | PREV_NAME_CLIENT_TYPE_XNA_MEAN | PREV_NAME_GOODS_CATEGORY_Additional Service_MEAN | PREV_NAME_GOODS_CATEGORY_Animals_MEAN | PREV_NAME_GOODS_CATEGORY_Audio/Video_MEAN | PREV_NAME_GOODS_CATEGORY_Auto Accessories_MEAN | PREV_NAME_GOODS_CATEGORY_Clothing and Accessories_MEAN | 
PREV_NAME_GOODS_CATEGORY_Computers_MEAN | PREV_NAME_GOODS_CATEGORY_Construction Materials_MEAN | PREV_NAME_GOODS_CATEGORY_Consumer Electronics_MEAN | PREV_NAME_GOODS_CATEGORY_Direct Sales_MEAN | PREV_NAME_GOODS_CATEGORY_Education_MEAN | PREV_NAME_GOODS_CATEGORY_Fitness_MEAN | PREV_NAME_GOODS_CATEGORY_Furniture_MEAN | PREV_NAME_GOODS_CATEGORY_Gardening_MEAN | PREV_NAME_GOODS_CATEGORY_Homewares_MEAN | PREV_NAME_GOODS_CATEGORY_House Construction_MEAN | PREV_NAME_GOODS_CATEGORY_Insurance_MEAN | PREV_NAME_GOODS_CATEGORY_Jewelry_MEAN | PREV_NAME_GOODS_CATEGORY_Medical Supplies_MEAN | PREV_NAME_GOODS_CATEGORY_Medicine_MEAN | PREV_NAME_GOODS_CATEGORY_Mobile_MEAN | PREV_NAME_GOODS_CATEGORY_Office Appliances_MEAN | PREV_NAME_GOODS_CATEGORY_Other_MEAN | PREV_NAME_GOODS_CATEGORY_Photo / Cinema Equipment_MEAN | PREV_NAME_GOODS_CATEGORY_Sport and Leisure_MEAN | PREV_NAME_GOODS_CATEGORY_Tourism_MEAN | PREV_NAME_GOODS_CATEGORY_Vehicles_MEAN | PREV_NAME_GOODS_CATEGORY_Weapon_MEAN | PREV_NAME_GOODS_CATEGORY_XNA_MEAN | PREV_NAME_PORTFOLIO_Cards_MEAN | PREV_NAME_PORTFOLIO_Cars_MEAN | PREV_NAME_PORTFOLIO_Cash_MEAN | PREV_NAME_PORTFOLIO_POS_MEAN | PREV_NAME_PORTFOLIO_XNA_MEAN | PREV_NAME_PRODUCT_TYPE_XNA_MEAN | PREV_NAME_PRODUCT_TYPE_walk-in_MEAN | PREV_NAME_PRODUCT_TYPE_x-sell_MEAN | PREV_CHANNEL_TYPE_AP+ (Cash loan)_MEAN | PREV_CHANNEL_TYPE_Car dealer_MEAN | PREV_CHANNEL_TYPE_Channel of corporate sales_MEAN | PREV_CHANNEL_TYPE_Contact center_MEAN | PREV_CHANNEL_TYPE_Country-wide_MEAN | PREV_CHANNEL_TYPE_Credit and cash offices_MEAN | PREV_CHANNEL_TYPE_Regional / Local_MEAN | PREV_CHANNEL_TYPE_Stone_MEAN | PREV_NAME_SELLER_INDUSTRY_Auto technology_MEAN | PREV_NAME_SELLER_INDUSTRY_Clothing_MEAN | PREV_NAME_SELLER_INDUSTRY_Connectivity_MEAN | PREV_NAME_SELLER_INDUSTRY_Construction_MEAN | PREV_NAME_SELLER_INDUSTRY_Consumer electronics_MEAN | PREV_NAME_SELLER_INDUSTRY_Furniture_MEAN | PREV_NAME_SELLER_INDUSTRY_Industry_MEAN | PREV_NAME_SELLER_INDUSTRY_Jewelry_MEAN | 
PREV_NAME_SELLER_INDUSTRY_MLM partners_MEAN | PREV_NAME_SELLER_INDUSTRY_Tourism_MEAN | PREV_NAME_SELLER_INDUSTRY_XNA_MEAN | PREV_NAME_YIELD_GROUP_XNA_MEAN | PREV_NAME_YIELD_GROUP_high_MEAN | PREV_NAME_YIELD_GROUP_low_action_MEAN | PREV_NAME_YIELD_GROUP_low_normal_MEAN | PREV_NAME_YIELD_GROUP_middle_MEAN | PREV_PRODUCT_COMBINATION_Card Street_MEAN | PREV_PRODUCT_COMBINATION_Card X-Sell_MEAN | PREV_PRODUCT_COMBINATION_Cash_MEAN | PREV_PRODUCT_COMBINATION_Cash Street: high_MEAN | PREV_PRODUCT_COMBINATION_Cash Street: low_MEAN | PREV_PRODUCT_COMBINATION_Cash Street: middle_MEAN | PREV_PRODUCT_COMBINATION_Cash X-Sell: high_MEAN | PREV_PRODUCT_COMBINATION_Cash X-Sell: low_MEAN | PREV_PRODUCT_COMBINATION_Cash X-Sell: middle_MEAN | PREV_PRODUCT_COMBINATION_POS household with interest_MEAN | PREV_PRODUCT_COMBINATION_POS household without interest_MEAN | PREV_PRODUCT_COMBINATION_POS industry with interest_MEAN | PREV_PRODUCT_COMBINATION_POS industry without interest_MEAN | PREV_PRODUCT_COMBINATION_POS mobile with interest_MEAN | PREV_PRODUCT_COMBINATION_POS mobile without interest_MEAN | PREV_PRODUCT_COMBINATION_POS other with interest_MEAN | PREV_PRODUCT_COMBINATION_POS others without interest_MEAN | TARGET | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| SK_ID_CURR | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 100002 | 1.038818e+06 | 9251.775 | 179055.00 | 179055.00 | 0.00 | 179055.00 | 9.000000 | 1.0 | 0.000000 | NaN | NaN | -606.000000 | 500.000000 | 24.000000 | NaN | -565.000000 | 125.000000 | -25.000000 | -17.000000 | 0.000000 | 42987.60 | 179055.00 | 0.0 | 1.000000 | 0.000000 | 1.000000 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.0 | 0.0 | 1.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 1.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 1.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.0 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.0 | 0.000000 | 0.0 | 0.000000 | 0.0 | 1.0 | 0.0 | 1 |
| 100003 | 2.281150e+06 | 56553.990 | 435436.50 | 484191.00 | 3442.50 | 435436.50 | 14.666667 | 1.0 | 0.050030 | NaN | NaN | -1305.000000 | 533.000000 | 10.000000 | NaN | -1274.333333 | -1004.333333 | -1054.333333 | -1047.333333 | 0.666667 | 65321.55 | 484191.00 | 0.0 | 1.000000 | 0.333333 | 0.666667 | 0.000000 | 0.0 | 0.333333 | 0.000000 | 0.333333 | 0.333333 | 0.000000 | 0.000000 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.666667 | 0.333333 | 1.000000 | 0.000000 | 0.000000 | 0.0 | 0.666667 | 0.0 | 0.0 | 0.333333 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 1.000000 | 0.0 | 0.0 | 0.666667 | 0.0 | 0.0 | 0.0 | 0.0 | 0.333333 | 0.000000 | 0.666667 | 0.333333 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.333333 | 0.0 | 0.0 | 0.0 | 0.333333 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.333333 | 0.000000 | 0.0 | 0.333333 | 0.666667 | 0.000000 | 0.666667 | 0.000000 | 0.333333 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.333333 | 0.333333 | 0.000000 | 0.333333 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.333333 | 0.333333 | 0.0 | 0.0 | 0.0 | 0.0 | 0.333333 | 0.000000 | 0.000000 | 0.0 | 0.333333 | 0.666667 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.333333 | 0.0 | 0.333333 | 0.0 | 0.333333 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0 |
| 100004 | 1.564014e+06 | 5357.250 | 24282.00 | 20106.00 | 4860.00 | 24282.00 | 5.000000 | 1.0 | 0.212008 | NaN | NaN | -815.000000 | 30.000000 | 4.000000 | NaN | -784.000000 | -694.000000 | -724.000000 | -714.000000 | 0.000000 | 1323.00 | 20106.00 | 0.0 | 1.000000 | 0.000000 | 1.000000 | 0.000000 | 0.0 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.0 | 1.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 1.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 1.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.0 | 0.0 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.0 | 0.000000 | 0.0 | 0.000000 | 1.0 | 0.0 | 0.0 | 0 |
| 100006 | 1.932462e+06 | 23651.175 | 272203.26 | 291695.50 | 34840.17 | 408304.89 | 14.666667 | 1.0 | 0.163412 | NaN | NaN | -272.444444 | 894.222222 | 23.000000 | NaN | -325.666667 | 364.333333 | -288.000000 | -279.500000 | 0.000000 | 200877.96 | 190960.50 | 100735.0 | 0.888889 | 0.555556 | 0.222222 | 0.222222 | 0.0 | 0.000000 | 0.000000 | 0.111111 | 0.111111 | 0.666667 | 0.111111 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.444444 | 0.555556 | 0.555556 | 0.333333 | 0.111111 | 0.0 | 0.444444 | 0.0 | 0.0 | 0.555556 | 0.0 | 0.0 | 0.111111 | 0.0 | 0.0 | 0.0 | 0.0 | 0.888889 | 0.0 | 0.0 | 0.111111 | 0.0 | 0.0 | 0.0 | 0.0 | 0.333333 | 0.111111 | 0.000000 | 0.888889 | 0.0 | 0.0 | 0.0 | 0.111111 | 0.0 | 0.0 | 0.0 | 0.111111 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.777778 | 0.111111 | 0.0 | 0.333333 | 0.222222 | 0.333333 | 0.555556 | 0.000000 | 0.444444 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.111111 | 0.777778 | 0.000000 | 0.111111 | 0.0 | 0.0 | 0.000000 | 0.111111 | 0.111111 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.777778 | 0.444444 | 0.222222 | 0.0 | 0.222222 | 0.111111 | 0.111111 | 0.111111 | 0.222222 | 0.000000 | 0.0 | 0.0 | 0.111111 | 0.222222 | 0.0 | 0.111111 | 0.0 | 0.111111 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0 |
| 100007 | 2.157812e+06 | 12278.805 | 150530.25 | 166638.75 | 3390.75 | 150530.25 | 12.333333 | 1.0 | 0.159516 | NaN | NaN | -1222.833333 | 409.166667 | 20.666667 | NaN | -1263.200000 | -837.200000 | -1140.500000 | -1131.000000 | 0.600000 | 106746.27 | 166638.75 | 0.0 | 1.000000 | 0.666667 | 0.333333 | 0.000000 | 0.0 | 0.166667 | 0.166667 | 0.166667 | 0.333333 | 0.166667 | 0.000000 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.333333 | 0.666667 | 1.000000 | 0.000000 | 0.000000 | 0.0 | 0.833333 | 0.0 | 0.0 | 0.166667 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 1.000000 | 0.0 | 0.0 | 0.333333 | 0.0 | 0.0 | 0.0 | 0.0 | 0.333333 | 0.166667 | 0.000000 | 0.833333 | 0.0 | 0.0 | 0.0 | 0.333333 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.666667 | 0.000000 | 0.0 | 0.666667 | 0.333333 | 0.000000 | 0.333333 | 0.166667 | 0.500000 | 0.166667 | 0.0 | 0.0 | 0.0 | 0.500000 | 0.166667 | 0.166667 | 0.000000 | 0.0 | 0.0 | 0.166667 | 0.000000 | 0.500000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.333333 | 0.000000 | 0.500000 | 0.0 | 0.000000 | 0.500000 | 0.000000 | 0.000000 | 0.000000 | 0.166667 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.5 | 0.166667 | 0.0 | 0.000000 | 0.0 | 0.166667 | 0.0 | 0.0 | 0.0 | 0 |
# Dimensions (rows, columns) of df_prev, the frame indexed by SK_ID_CURR that
# holds the PREV_*_MEAN aggregates together with TARGET.
df_prev.shape
(291057, 168)
# Summarise every column of df_prev with the project helper. The rendered table
# lists, per column, a correlation (presumably with TARGET, whose own row shows
# 1.00), the number of missing values and the percentage of rows missing.
# NOTE(review): the helper prints that all 168 columns contain missing values,
# yet most rows of the table report 0 missing -- the column counter inside
# Functions.numerical_summary looks wrong; confirm there.
numerical_summary(df_prev)
Data Frame a 168 colonnes. Dont 168 colonnes contiennent des valeurs manquantes.
| correlations | valeurs_manquantes | %_total | |
|---|---|---|---|
| PREV_PREV_APP_XAP_MEAN | -0.07 | 0 | 0.00 |
| PREV_CODE_REJECT_REASON_XAP_MEAN | -0.07 | 0 | 0.00 |
| PREV_NAME_CONTRACT_STATUS_Approved_MEAN | -0.06 | 0 | 0.00 |
| PREV_Amount_credit_accepted_MEAN | -0.04 | 0 | 0.00 |
| PREV_NAME_YIELD_GROUP_low_normal_MEAN | -0.04 | 0 | 0.00 |
| PREV_PRODUCT_COMBINATION_Cash X-Sell: low_MEAN | -0.04 | 0 | 0.00 |
| PREV_HOUR_APPR_PROCESS_START_MEAN | -0.04 | 0 | 0.00 |
| PREV_AMT_ANNUITY_MEAN | -0.03 | 417 | 0.14 |
| PREV_NAME_PAYMENT_TYPE_Cash through the bank_MEAN | -0.03 | 0 | 0.00 |
| PREV_NAME_PORTFOLIO_POS_MEAN | -0.03 | 0 | 0.00 |
| PREV_PRODUCT_COMBINATION_POS industry with interest_MEAN | -0.03 | 0 | 0.00 |
| PREV_RATE_DOWN_PAYMENT_MEAN | -0.03 | 17452 | 6.00 |
| PREV_NAME_CONTRACT_TYPE_Consumer loans_MEAN | -0.03 | 0 | 0.00 |
| PREV_NAME_GOODS_CATEGORY_Furniture_MEAN | -0.03 | 0 | 0.00 |
| PREV_NAME_YIELD_GROUP_low_action_MEAN | -0.03 | 0 | 0.00 |
| PREV_NAME_SELLER_INDUSTRY_Furniture_MEAN | -0.03 | 0 | 0.00 |
| PREV_NAME_CLIENT_TYPE_Refreshed_MEAN | -0.03 | 0 | 0.00 |
| PREV_AMT_DOWN_PAYMENT_MEAN | -0.02 | 17452 | 6.00 |
| PREV_PRODUCT_COMBINATION_POS household without interest_MEAN | -0.02 | 0 | 0.00 |
| PREV_NAME_TYPE_SUITE_Family_MEAN | -0.02 | 0 | 0.00 |
| PREV_NAME_SELLER_INDUSTRY_Consumer electronics_MEAN | -0.02 | 0 | 0.00 |
| PREV_PRODUCT_COMBINATION_POS industry without interest_MEAN | -0.02 | 0 | 0.00 |
| PREV_AMT_APPLICATION_MEAN | -0.02 | 0 | 0.00 |
| PREV_NAME_GOODS_CATEGORY_Clothing and Accessories_MEAN | -0.02 | 0 | 0.00 |
| PREV_NAME_PRODUCT_TYPE_XNA_MEAN | -0.02 | 0 | 0.00 |
| PREV_CHANNEL_TYPE_Stone_MEAN | -0.02 | 0 | 0.00 |
| PREV_NAME_SELLER_INDUSTRY_Clothing_MEAN | -0.02 | 0 | 0.00 |
| PREV_NAME_GOODS_CATEGORY_Consumer Electronics_MEAN | -0.02 | 0 | 0.00 |
| PREV_NAME_PRODUCT_TYPE_x-sell_MEAN | -0.02 | 0 | 0.00 |
| PREV_NAME_TYPE_SUITE_Children_MEAN | -0.02 | 0 | 0.00 |
| PREV_AMT_CREDIT_MEAN | -0.02 | 0 | 0.00 |
| PREV_NAME_YIELD_GROUP_middle_MEAN | -0.02 | 0 | 0.00 |
| PREV_AMT_GOODS_PRICE_MEAN | -0.02 | 975 | 0.33 |
| PREV_PRODUCT_COMBINATION_Cash X-Sell: middle_MEAN | -0.02 | 0 | 0.00 |
| PREV_NAME_SELLER_INDUSTRY_Construction_MEAN | -0.02 | 0 | 0.00 |
| PREV_NAME_CASH_LOAN_PURPOSE_XAP_MEAN | -0.02 | 0 | 0.00 |
| PREV_NAME_TYPE_SUITE_Unaccompanied_MEAN | -0.01 | 0 | 0.00 |
| PREV_PRODUCT_COMBINATION_POS household with interest_MEAN | -0.01 | 0 | 0.00 |
| PREV_NAME_GOODS_CATEGORY_Construction Materials_MEAN | -0.01 | 0 | 0.00 |
| PREV_WEEKDAY_APPR_PROCESS_START_SUNDAY_MEAN | -0.01 | 0 | 0.00 |
| PREV_NAME_GOODS_CATEGORY_Medical Supplies_MEAN | -0.01 | 0 | 0.00 |
| PREV_CHANNEL_TYPE_Regional / Local_MEAN | -0.01 | 0 | 0.00 |
| PREV_NAME_SELLER_INDUSTRY_Industry_MEAN | -0.01 | 0 | 0.00 |
| PREV_CHANNEL_TYPE_Channel of corporate sales_MEAN | -0.01 | 0 | 0.00 |
| PREV_FLAG_LAST_APPL_PER_CONTRACT_Y_MEAN | -0.01 | 0 | 0.00 |
| PREV_WEEKDAY_APPR_PROCESS_START_SATURDAY_MEAN | -0.01 | 0 | 0.00 |
| PREV_NAME_GOODS_CATEGORY_Audio/Video_MEAN | -0.01 | 0 | 0.00 |
| PREV_NAME_GOODS_CATEGORY_Tourism_MEAN | -0.01 | 0 | 0.00 |
| PREV_NAME_GOODS_CATEGORY_Homewares_MEAN | -0.01 | 0 | 0.00 |
| PREV_NAME_GOODS_CATEGORY_Medicine_MEAN | -0.01 | 0 | 0.00 |
| PREV_NAME_TYPE_SUITE_Spouse, partner_MEAN | -0.01 | 0 | 0.00 |
| PREV_NAME_GOODS_CATEGORY_Other_MEAN | -0.00 | 0 | 0.00 |
| PREV_NAME_CLIENT_TYPE_Repeater_MEAN | -0.00 | 0 | 0.00 |
| PREV_NAME_SELLER_INDUSTRY_Tourism_MEAN | -0.00 | 0 | 0.00 |
| PREV_NAME_GOODS_CATEGORY_Photo / Cinema Equipment_MEAN | -0.00 | 0 | 0.00 |
| PREV_NAME_GOODS_CATEGORY_Computers_MEAN | -0.00 | 0 | 0.00 |
| PREV_NAME_GOODS_CATEGORY_Gardening_MEAN | -0.00 | 0 | 0.00 |
| PREV_NAME_SELLER_INDUSTRY_MLM partners_MEAN | -0.00 | 0 | 0.00 |
| PREV_PRODUCT_COMBINATION_POS other with interest_MEAN | -0.00 | 0 | 0.00 |
| PREV_NFLAG_LAST_APPL_IN_DAY_MEAN | -0.00 | 0 | 0.00 |
| PREV_NAME_GOODS_CATEGORY_Fitness_MEAN | -0.00 | 0 | 0.00 |
| PREV_CHANNEL_TYPE_Car dealer_MEAN | -0.00 | 0 | 0.00 |
| PREV_NAME_PORTFOLIO_Cars_MEAN | -0.00 | 0 | 0.00 |
| PREV_PRODUCT_COMBINATION_POS mobile without interest_MEAN | -0.00 | 0 | 0.00 |
| PREV_SELLERPLACE_AREA_MEAN | -0.00 | 0 | 0.00 |
| PREV_NAME_GOODS_CATEGORY_Additional Service_MEAN | -0.00 | 0 | 0.00 |
| PREV_NAME_GOODS_CATEGORY_Education_MEAN | -0.00 | 0 | 0.00 |
| PREV_PRODUCT_COMBINATION_POS others without interest_MEAN | -0.00 | 0 | 0.00 |
| PREV_CHANNEL_TYPE_Country-wide_MEAN | -0.00 | 0 | 0.00 |
| PREV_NAME_GOODS_CATEGORY_Weapon_MEAN | -0.00 | 0 | 0.00 |
| PREV_NAME_GOODS_CATEGORY_Sport and Leisure_MEAN | -0.00 | 0 | 0.00 |
| PREV_CODE_REJECT_REASON_SYSTEM_MEAN | -0.00 | 0 | 0.00 |
| PREV_NAME_GOODS_CATEGORY_Animals_MEAN | -0.00 | 0 | 0.00 |
| PREV_NFLAG_INSURED_ON_APPROVAL_MEAN | -0.00 | 1297 | 0.45 |
| PREV_NAME_CASH_LOAN_PURPOSE_XNA_MEAN | -0.00 | 0 | 0.00 |
| PREV_RATE_INTEREST_PRIMARY_MEAN | -0.00 | 286448 | 98.42 |
| PREV_NAME_GOODS_CATEGORY_Office Appliances_MEAN | -0.00 | 0 | 0.00 |
| PREV_NAME_PAYMENT_TYPE_Cashless from the account of the employer_MEAN | 0.00 | 0 | 0.00 |
| PREV_NAME_CASH_LOAN_PURPOSE_Money for a third person_MEAN | 0.00 | 0 | 0.00 |
| PREV_NAME_PAYMENT_TYPE_Non-cash from your account_MEAN | 0.00 | 0 | 0.00 |
| PREV_WEEKDAY_APPR_PROCESS_START_TUESDAY_MEAN | 0.00 | 0 | 0.00 |
| PREV_CODE_REJECT_REASON_VERIF_MEAN | 0.00 | 0 | 0.00 |
| PREV_NAME_CASH_LOAN_PURPOSE_Buying a garage_MEAN | 0.00 | 0 | 0.00 |
| PREV_NAME_CASH_LOAN_PURPOSE_Buying a home_MEAN | 0.00 | 0 | 0.00 |
| PREV_CODE_REJECT_REASON_CLIENT_MEAN | 0.00 | 0 | 0.00 |
| PREV_NAME_CONTRACT_STATUS_Unused offer_MEAN | 0.00 | 0 | 0.00 |
| PREV_NAME_TYPE_SUITE_Group of people_MEAN | 0.00 | 0 | 0.00 |
| PREV_NAME_CASH_LOAN_PURPOSE_Refusal to name the goal_MEAN | 0.00 | 0 | 0.00 |
| PREV_NAME_GOODS_CATEGORY_Direct Sales_MEAN | 0.00 | 0 | 0.00 |
| PREV_NAME_CASH_LOAN_PURPOSE_Furniture_MEAN | 0.00 | 0 | 0.00 |
| PREV_NAME_CASH_LOAN_PURPOSE_Business development_MEAN | 0.00 | 0 | 0.00 |
| PREV_NAME_CASH_LOAN_PURPOSE_Buying a new car_MEAN | 0.00 | 0 | 0.00 |
| PREV_NAME_CASH_LOAN_PURPOSE_Journey_MEAN | 0.00 | 0 | 0.00 |
| PREV_NAME_CLIENT_TYPE_XNA_MEAN | 0.00 | 0 | 0.00 |
| PREV_SK_ID_PREV_MEAN | 0.00 | 0 | 0.00 |
| PREV_NAME_SELLER_INDUSTRY_Jewelry_MEAN | 0.00 | 0 | 0.00 |
| PREV_NAME_CASH_LOAN_PURPOSE_Everyday expenses_MEAN | 0.00 | 0 | 0.00 |
| PREV_NAME_TYPE_SUITE_Other_B_MEAN | 0.00 | 0 | 0.00 |
| PREV_PRODUCT_COMBINATION_Cash Street: low_MEAN | 0.00 | 0 | 0.00 |
| PREV_NAME_GOODS_CATEGORY_Insurance_MEAN | 0.00 | 0 | 0.00 |
| PREV_NAME_GOODS_CATEGORY_Jewelry_MEAN | 0.00 | 0 | 0.00 |
| PREV_NAME_GOODS_CATEGORY_Auto Accessories_MEAN | 0.00 | 0 | 0.00 |
| PREV_WEEKDAY_APPR_PROCESS_START_THURSDAY_MEAN | 0.00 | 0 | 0.00 |
| PREV_NAME_CASH_LOAN_PURPOSE_Purchase of electronic equipment_MEAN | 0.00 | 0 | 0.00 |
| PREV_NAME_CASH_LOAN_PURPOSE_Education_MEAN | 0.00 | 0 | 0.00 |
| PREV_NAME_GOODS_CATEGORY_Vehicles_MEAN | 0.00 | 0 | 0.00 |
| PREV_NAME_CASH_LOAN_PURPOSE_Hobby_MEAN | 0.00 | 0 | 0.00 |
| PREV_CODE_REJECT_REASON_XNA_MEAN | 0.00 | 0 | 0.00 |
| PREV_WEEKDAY_APPR_PROCESS_START_WEDNESDAY_MEAN | 0.00 | 0 | 0.00 |
| PREV_NAME_CASH_LOAN_PURPOSE_Buying a holiday home / land_MEAN | 0.00 | 0 | 0.00 |
| PREV_NAME_CASH_LOAN_PURPOSE_Wedding / gift / holiday_MEAN | 0.00 | 0 | 0.00 |
| PREV_WEEKDAY_APPR_PROCESS_START_FRIDAY_MEAN | 0.00 | 0 | 0.00 |
| PREV_NAME_TYPE_SUITE_Other_A_MEAN | 0.01 | 0 | 0.00 |
| PREV_NAME_SELLER_INDUSTRY_Auto technology_MEAN | 0.01 | 0 | 0.00 |
| PREV_WEEKDAY_APPR_PROCESS_START_MONDAY_MEAN | 0.01 | 0 | 0.00 |
| PREV_NAME_PORTFOLIO_Cash_MEAN | 0.01 | 0 | 0.00 |
| PREV_NAME_CASH_LOAN_PURPOSE_Buying a used car_MEAN | 0.01 | 0 | 0.00 |
| PREV_NAME_CASH_LOAN_PURPOSE_Gasification / water supply_MEAN | 0.01 | 0 | 0.00 |
| PREV_FLAG_LAST_APPL_PER_CONTRACT_N_MEAN | 0.01 | 0 | 0.00 |
| PREV_NAME_CONTRACT_TYPE_XNA_MEAN | 0.01 | 0 | 0.00 |
| PREV_NAME_CASH_LOAN_PURPOSE_Medicine_MEAN | 0.01 | 0 | 0.00 |
| PREV_PRODUCT_COMBINATION_Card X-Sell_MEAN | 0.01 | 0 | 0.00 |
| PREV_CHANNEL_TYPE_Credit and cash offices_MEAN | 0.01 | 0 | 0.00 |
| PREV_NAME_CASH_LOAN_PURPOSE_Building a house or an annex_MEAN | 0.01 | 0 | 0.00 |
| PREV_NAME_CASH_LOAN_PURPOSE_Car repairs_MEAN | 0.01 | 0 | 0.00 |
| PREV_CHANNEL_TYPE_Contact center_MEAN | 0.01 | 0 | 0.00 |
| PREV_CODE_REJECT_REASON_SCO_MEAN | 0.01 | 0 | 0.00 |
| PREV_NAME_CONTRACT_STATUS_Canceled_MEAN | 0.01 | 0 | 0.00 |
| PREV_NAME_CASH_LOAN_PURPOSE_Payments on other loans_MEAN | 0.01 | 0 | 0.00 |
| PREV_NAME_CONTRACT_TYPE_Cash loans_MEAN | 0.02 | 0 | 0.00 |
| PREV_NAME_GOODS_CATEGORY_Mobile_MEAN | 0.02 | 0 | 0.00 |
| PREV_PRODUCT_COMBINATION_Cash Street: middle_MEAN | 0.02 | 0 | 0.00 |
| PREV_PRODUCT_COMBINATION_Cash_MEAN | 0.02 | 0 | 0.00 |
| PREV_NAME_CASH_LOAN_PURPOSE_Urgent needs_MEAN | 0.02 | 0 | 0.00 |
| PREV_NAME_CASH_LOAN_PURPOSE_Other_MEAN | 0.02 | 0 | 0.00 |
| PREV_NAME_CLIENT_TYPE_New_MEAN | 0.02 | 0 | 0.00 |
| PREV_PRODUCT_COMBINATION_POS mobile with interest_MEAN | 0.02 | 0 | 0.00 |
| PREV_NAME_CASH_LOAN_PURPOSE_Repairs_MEAN | 0.02 | 0 | 0.00 |
| PREV_EXTRA_AMT_PAID_MEAN | 0.02 | 417 | 0.14 |
| PREV_NAME_SELLER_INDUSTRY_XNA_MEAN | 0.02 | 0 | 0.00 |
| PREV_CNT_PAYMENT_MEAN | 0.03 | 415 | 0.14 |
| PREV_NAME_SELLER_INDUSTRY_Connectivity_MEAN | 0.03 | 0 | 0.00 |
| PREV_NAME_PORTFOLIO_XNA_MEAN | 0.03 | 0 | 0.00 |
| PREV_PRODUCT_COMBINATION_Cash Street: high_MEAN | 0.03 | 0 | 0.00 |
| PREV_PRODUCT_COMBINATION_Cash X-Sell: high_MEAN | 0.03 | 0 | 0.00 |
| PREV_RATE_INTEREST_PRIVILEGED_MEAN | 0.03 | 286448 | 98.42 |
| PREV_NAME_PORTFOLIO_Cards_MEAN | 0.03 | 0 | 0.00 |
| PREV_NAME_GOODS_CATEGORY_XNA_MEAN | 0.03 | 0 | 0.00 |
| PREV_DAYS_TERMINATION_MEAN | 0.03 | 24043 | 8.26 |
| PREV_DAYS_LAST_DUE_MEAN | 0.03 | 21945 | 7.54 |
| PREV_Amount_credit_rejected_MEAN | 0.03 | 0 | 0.00 |
| PREV_CHANNEL_TYPE_AP+ (Cash loan)_MEAN | 0.03 | 0 | 0.00 |
| PREV_NAME_PAYMENT_TYPE_XNA_MEAN | 0.03 | 0 | 0.00 |
| PREV_NAME_YIELD_GROUP_high_MEAN | 0.04 | 0 | 0.00 |
| PREV_NAME_CONTRACT_TYPE_Revolving loans_MEAN | 0.04 | 0 | 0.00 |
| PREV_CODE_REJECT_REASON_LIMIT_MEAN | 0.04 | 0 | 0.00 |
| PREV_DAYS_FIRST_DUE_MEAN | 0.04 | 1827 | 0.63 |
| PREV_PRODUCT_COMBINATION_Card Street_MEAN | 0.04 | 0 | 0.00 |
| PREV_DAYS_LAST_DUE_1ST_VERSION_MEAN | 0.04 | 4038 | 1.39 |
| PREV_NAME_YIELD_GROUP_XNA_MEAN | 0.04 | 0 | 0.00 |
| PREV_DAYS_DECISION_MEAN | 0.05 | 0 | 0.00 |
| PREV_CODE_REJECT_REASON_HC_MEAN | 0.05 | 0 | 0.00 |
| PREV_CODE_REJECT_REASON_SCOFR_MEAN | 0.06 | 0 | 0.00 |
| PREV_NAME_PRODUCT_TYPE_walk-in_MEAN | 0.06 | 0 | 0.00 |
| PREV_NAME_CONTRACT_STATUS_Refused_MEAN | 0.08 | 0 | 0.00 |
| PREV_DAYS_FIRST_DRAWING_MEAN | 0.10 | 237705 | 81.67 |
| TARGET | 1.00 | 0 | 0.00 |
| PREV_NAME_GOODS_CATEGORY_House Construction_MEAN | NaN | 0 | 0.00 |
# Keep only the columns that select_numerical retains for these thresholds,
# then remove the label so that `prev` holds predictors only.
# In the output below, PREV_DAYS_FIRST_DRAWING_MEAN (81.67 % missing) is listed
# in the correlation table but absent from the returned frame -- presumably the
# effect of missing=70; confirm in Functions.select_numerical.
prev = select_numerical(df_prev, missing=70, correlation=0.03)
prev = prev.drop(columns='TARGET')
prev
Data Frame a 168 colonnes. Dont 168 colonnes contiennent des valeurs manquantes.
'>>>> Most correlated with TARGET from df_prev'
| correlations | valeurs_manquantes | %_total | |
|---|---|---|---|
| PREV_PREV_APP_XAP_MEAN | -0.07 | 0 | 0.00 |
| PREV_CODE_REJECT_REASON_XAP_MEAN | -0.07 | 0 | 0.00 |
| PREV_NAME_CONTRACT_STATUS_Approved_MEAN | -0.06 | 0 | 0.00 |
| PREV_Amount_credit_accepted_MEAN | -0.04 | 0 | 0.00 |
| PREV_NAME_YIELD_GROUP_low_normal_MEAN | -0.04 | 0 | 0.00 |
| PREV_PRODUCT_COMBINATION_Cash X-Sell: low_MEAN | -0.04 | 0 | 0.00 |
| PREV_HOUR_APPR_PROCESS_START_MEAN | -0.04 | 0 | 0.00 |
| PREV_NAME_YIELD_GROUP_high_MEAN | 0.04 | 0 | 0.00 |
| PREV_NAME_CONTRACT_TYPE_Revolving loans_MEAN | 0.04 | 0 | 0.00 |
| PREV_CODE_REJECT_REASON_LIMIT_MEAN | 0.04 | 0 | 0.00 |
| PREV_DAYS_FIRST_DUE_MEAN | 0.04 | 1827 | 0.63 |
| PREV_PRODUCT_COMBINATION_Card Street_MEAN | 0.04 | 0 | 0.00 |
| PREV_DAYS_LAST_DUE_1ST_VERSION_MEAN | 0.04 | 4038 | 1.39 |
| PREV_NAME_YIELD_GROUP_XNA_MEAN | 0.04 | 0 | 0.00 |
| PREV_DAYS_DECISION_MEAN | 0.05 | 0 | 0.00 |
| PREV_CODE_REJECT_REASON_HC_MEAN | 0.05 | 0 | 0.00 |
| PREV_CODE_REJECT_REASON_SCOFR_MEAN | 0.06 | 0 | 0.00 |
| PREV_NAME_PRODUCT_TYPE_walk-in_MEAN | 0.06 | 0 | 0.00 |
| PREV_NAME_CONTRACT_STATUS_Refused_MEAN | 0.08 | 0 | 0.00 |
| PREV_DAYS_FIRST_DRAWING_MEAN | 0.10 | 237705 | 81.67 |
| TARGET | 1.00 | 0 | 0.00 |
| PREV_PREV_APP_XAP_MEAN | PREV_CODE_REJECT_REASON_XAP_MEAN | PREV_NAME_CONTRACT_STATUS_Approved_MEAN | PREV_Amount_credit_accepted_MEAN | PREV_NAME_YIELD_GROUP_low_normal_MEAN | PREV_PRODUCT_COMBINATION_Cash X-Sell: low_MEAN | PREV_HOUR_APPR_PROCESS_START_MEAN | PREV_NAME_YIELD_GROUP_high_MEAN | PREV_NAME_CONTRACT_TYPE_Revolving loans_MEAN | PREV_CODE_REJECT_REASON_LIMIT_MEAN | PREV_DAYS_FIRST_DUE_MEAN | PREV_PRODUCT_COMBINATION_Card Street_MEAN | PREV_DAYS_LAST_DUE_1ST_VERSION_MEAN | PREV_NAME_YIELD_GROUP_XNA_MEAN | PREV_DAYS_DECISION_MEAN | PREV_CODE_REJECT_REASON_HC_MEAN | PREV_CODE_REJECT_REASON_SCOFR_MEAN | PREV_NAME_PRODUCT_TYPE_walk-in_MEAN | PREV_NAME_CONTRACT_STATUS_Refused_MEAN | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| SK_ID_CURR | |||||||||||||||||||
| 100002 | 1.000000 | 1.000000 | 1.000000 | 179055.00 | 1.000000 | 0.000000 | 9.000000 | 0.000000 | 0.000000 | 0.000000 | -565.000000 | 0.000000 | 125.000000 | 0.000000 | -606.000000 | 0.00 | 0.0 | 0.000000 | 0.000000 |
| 100003 | 1.000000 | 1.000000 | 1.000000 | 484191.00 | 0.333333 | 0.333333 | 14.666667 | 0.000000 | 0.000000 | 0.000000 | -1274.333333 | 0.000000 | -1004.333333 | 0.000000 | -1305.000000 | 0.00 | 0.0 | 0.000000 | 0.000000 |
| 100004 | 1.000000 | 1.000000 | 1.000000 | 20106.00 | 0.000000 | 0.000000 | 5.000000 | 0.000000 | 0.000000 | 0.000000 | -784.000000 | 0.000000 | -694.000000 | 0.000000 | -815.000000 | 0.00 | 0.0 | 0.000000 | 0.000000 |
| 100006 | 0.888889 | 0.888889 | 0.555556 | 190960.50 | 0.222222 | 0.222222 | 14.666667 | 0.222222 | 0.222222 | 0.111111 | -325.666667 | 0.111111 | 364.333333 | 0.444444 | -272.444444 | 0.00 | 0.0 | 0.000000 | 0.111111 |
| 100007 | 1.000000 | 1.000000 | 1.000000 | 166638.75 | 0.000000 | 0.000000 | 12.333333 | 0.500000 | 0.000000 | 0.000000 | -1263.200000 | 0.000000 | -837.200000 | 0.000000 | -1222.833333 | 0.00 | 0.0 | 0.166667 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 456251 | 1.000000 | 1.000000 | 1.000000 | 40455.00 | 0.000000 | 0.000000 | 17.000000 | 1.000000 | 0.000000 | 0.000000 | -210.000000 | 0.000000 | 0.000000 | 0.000000 | -273.000000 | 0.00 | 0.0 | 0.000000 | 0.000000 |
| 456252 | 1.000000 | 1.000000 | 1.000000 | 56821.50 | 1.000000 | 0.000000 | 10.000000 | 0.000000 | 0.000000 | 0.000000 | -2466.000000 | 0.000000 | -2316.000000 | 0.000000 | -2497.000000 | 0.00 | 0.0 | 0.000000 | 0.000000 |
| 456253 | 1.000000 | 1.000000 | 1.000000 | 20625.75 | 0.000000 | 0.000000 | 11.500000 | 1.000000 | 0.000000 | 0.000000 | -2339.000000 | 0.000000 | -2219.000000 | 0.000000 | -2380.000000 | 0.00 | 0.0 | 0.000000 | 0.000000 |
| 456254 | 1.000000 | 1.000000 | 1.000000 | 134439.75 | 0.500000 | 0.000000 | 15.000000 | 0.500000 | 0.000000 | 0.000000 | -269.000000 | 0.000000 | 151.000000 | 0.000000 | -299.500000 | 0.00 | 0.0 | 0.000000 | 0.000000 |
| 456255 | 0.750000 | 0.750000 | 0.750000 | 285313.50 | 0.250000 | 0.125000 | 14.625000 | 0.250000 | 0.125000 | 0.000000 | -648.333333 | 0.125000 | -108.333333 | 0.125000 | -587.625000 | 0.25 | 0.0 | 0.250000 | 0.250000 |
291057 rows × 19 columns
# Compare each selected previous-application feature with the target using the
# project helper (the string is presumably used as the figure title).
# NOTE(review): `prev` is indexed by SK_ID_CURR and has 291057 rows, while the
# target comes from app_train -- verify that frame_vs_target aligns the two on
# the same key rather than by position.
frame_vs_target(prev, app_train.TARGET, 'Variables aggregated on SK_ID_CURR from previous_application.csv ')
import gc

# The intermediate previous-application frames are not used after this point;
# drop the references and force a collection so their memory is released.
del prev_data, prev_agg, df_prev
gc.collect()
47012
# Load the POS_CASH_balance table and show its (rows, columns) shape.
pos_data = pd.read_csv('../donnees/POS_CASH_balance.csv')
pos_data.shape
(10001358, 8)
# Preview the first rows of the raw POS_CASH_balance table.
pos_data.head()
| SK_ID_PREV | SK_ID_CURR | MONTHS_BALANCE | CNT_INSTALMENT | CNT_INSTALMENT_FUTURE | NAME_CONTRACT_STATUS | SK_DPD | SK_DPD_DEF | |
|---|---|---|---|---|---|---|---|---|
| 0 | 1803195 | 182943 | -31 | 48.0 | 45.0 | Active | 0 | 0 |
| 1 | 1715348 | 367990 | -33 | 36.0 | 35.0 | Active | 0 | 0 |
| 2 | 1784872 | 397406 | -32 | 12.0 | 9.0 | Active | 0 | 0 |
| 3 | 1903291 | 269225 | -35 | 48.0 | 42.0 | Active | 0 | 0 |
| 4 | 2341044 | 334279 | -35 | 36.0 | 35.0 | Active | 0 | 0 |
# Frequency of each contract status; in the output below 'XNA' occurs only
# twice among roughly ten million rows.
pos_data['NAME_CONTRACT_STATUS'].value_counts()
Active 9151119 Completed 744883 Signed 87260 Demand 7065 Returned to the store 5461 Approved 4917 Amortized debt 636 Canceled 15 XNA 2 Name: NAME_CONTRACT_STATUS, dtype: int64
# Derived feature: CNT_INSTALMENT minus CNT_INSTALMENT_FUTURE.
# (The describe() output below reports a minimum of -51, so some raw rows have
# CNT_INSTALMENT_FUTURE greater than CNT_INSTALMENT.)
pos_data['PAID_INSTALMENT'] = pos_data['CNT_INSTALMENT'].sub(pos_data['CNT_INSTALMENT_FUTURE'])

# Discard the rows whose contract status is 'XNA'.
pos_data = pos_data.loc[pos_data['NAME_CONTRACT_STATUS'].ne('XNA')]

# One-hot encoding: the project helper returns the encoded frame first; the
# second value is kept as pos_data_cat_columns and the third is ignored.
pos_data, pos_data_cat_columns, _ = one_hot_encoding_dataframe(pos_data)
pos_data.describe()
| SK_ID_PREV | SK_ID_CURR | MONTHS_BALANCE | CNT_INSTALMENT | CNT_INSTALMENT_FUTURE | SK_DPD | SK_DPD_DEF | PAID_INSTALMENT | NAME_CONTRACT_STATUS_Active | NAME_CONTRACT_STATUS_Amortized debt | NAME_CONTRACT_STATUS_Approved | NAME_CONTRACT_STATUS_Canceled | NAME_CONTRACT_STATUS_Completed | NAME_CONTRACT_STATUS_Demand | NAME_CONTRACT_STATUS_Returned to the store | NAME_CONTRACT_STATUS_Signed | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1.000136e+07 | 1.000136e+07 | 1.000136e+07 | 9.975287e+06 | 9.975271e+06 | 1.000136e+07 | 1.000136e+07 | 9.975174e+06 | 1.000136e+07 | 1.000136e+07 | 1.000136e+07 | 1.000136e+07 | 1.000136e+07 | 1.000136e+07 | 1.000136e+07 | 1.000136e+07 |
| mean | 1.903217e+06 | 2.784039e+05 | -3.501259e+01 | 1.708965e+01 | 1.048384e+01 | 1.160693e+01 | 6.544686e-01 | 6.605944e+00 | 9.149878e-01 | 6.359138e-05 | 4.916333e-04 | 1.499797e-06 | 7.447820e-02 | 7.064042e-04 | 5.460260e-04 | 8.724817e-03 |
| std | 5.358466e+05 | 1.027637e+05 | 2.606657e+01 | 1.199506e+01 | 1.110906e+01 | 1.327141e+02 | 3.276249e+01 | 5.923767e+00 | 2.788998e-01 | 7.974167e-03 | 2.216736e-02 | 1.224661e-03 | 2.625475e-01 | 2.656888e-02 | 2.336082e-02 | 9.299836e-02 |
| min | 1.000001e+06 | 1.000010e+05 | -9.600000e+01 | 1.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | -5.100000e+01 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 |
| 25% | 1.434405e+06 | 1.895500e+05 | -5.400000e+01 | 1.000000e+01 | 3.000000e+00 | 0.000000e+00 | 0.000000e+00 | 2.000000e+00 | 1.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 |
| 50% | 1.896565e+06 | 2.786540e+05 | -2.800000e+01 | 1.200000e+01 | 7.000000e+00 | 0.000000e+00 | 0.000000e+00 | 5.000000e+00 | 1.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 |
| 75% | 2.368963e+06 | 3.674290e+05 | -1.300000e+01 | 2.400000e+01 | 1.400000e+01 | 0.000000e+00 | 0.000000e+00 | 9.000000e+00 | 1.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 |
| max | 2.843499e+06 | 4.562550e+05 | -1.000000e+00 | 9.200000e+01 | 8.500000e+01 | 4.231000e+03 | 3.595000e+03 | 7.200000e+01 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 |
# Aggregation spec: every column except the two identifiers is summarised
# with its mean, minimum and maximum.
pos_data_agg = {
    feature: ['mean', 'min', 'max']
    for feature in pos_data.columns
    if feature not in ('SK_ID_CURR', 'SK_ID_PREV')
}

# Apply the aggregation: one row per SK_ID_CURR, with (column, statistic)
# column pairs in the result.
pos_agg = pos_data.groupby('SK_ID_CURR').agg(pos_data_agg)
pos_agg.head()
| MONTHS_BALANCE | CNT_INSTALMENT | CNT_INSTALMENT_FUTURE | SK_DPD | SK_DPD_DEF | PAID_INSTALMENT | NAME_CONTRACT_STATUS_Active | NAME_CONTRACT_STATUS_Amortized debt | NAME_CONTRACT_STATUS_Approved | NAME_CONTRACT_STATUS_Canceled | NAME_CONTRACT_STATUS_Completed | NAME_CONTRACT_STATUS_Demand | NAME_CONTRACT_STATUS_Returned to the store | NAME_CONTRACT_STATUS_Signed | |||||||||||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| mean | min | max | mean | min | max | mean | min | max | mean | min | max | mean | min | max | mean | min | max | mean | min | max | mean | min | max | mean | min | max | mean | min | max | mean | min | max | mean | min | max | mean | min | max | mean | min | max | |
| SK_ID_CURR | ||||||||||||||||||||||||||||||||||||||||||
| 100001 | -72.555556 | -96 | -53 | 4.000000 | 4.0 | 4.0 | 1.444444 | 0.0 | 4.0 | 0.777778 | 0 | 7 | 0.777778 | 0 | 7 | 2.555556 | 0.0 | 4.0 | 0.777778 | 0 | 1 | 0.0 | 0 | 0 | 0.0 | 0 | 0 | 0.0 | 0 | 0 | 0.222222 | 0 | 1 | 0.0 | 0 | 0 | 0.0 | 0 | 0 | 0.000000 | 0 | 0 |
| 100002 | -10.000000 | -19 | -1 | 24.000000 | 24.0 | 24.0 | 15.000000 | 6.0 | 24.0 | 0.000000 | 0 | 0 | 0.000000 | 0 | 0 | 9.000000 | 0.0 | 18.0 | 1.000000 | 1 | 1 | 0.0 | 0 | 0 | 0.0 | 0 | 0 | 0.0 | 0 | 0 | 0.000000 | 0 | 0 | 0.0 | 0 | 0 | 0.0 | 0 | 0 | 0.000000 | 0 | 0 |
| 100003 | -43.785714 | -77 | -18 | 10.107143 | 6.0 | 12.0 | 5.785714 | 0.0 | 12.0 | 0.000000 | 0 | 0 | 0.000000 | 0 | 0 | 4.321429 | 0.0 | 11.0 | 0.928571 | 0 | 1 | 0.0 | 0 | 0 | 0.0 | 0 | 0 | 0.0 | 0 | 0 | 0.071429 | 0 | 1 | 0.0 | 0 | 0 | 0.0 | 0 | 0 | 0.000000 | 0 | 0 |
| 100004 | -25.500000 | -27 | -24 | 3.750000 | 3.0 | 4.0 | 2.250000 | 0.0 | 4.0 | 0.000000 | 0 | 0 | 0.000000 | 0 | 0 | 1.500000 | 0.0 | 3.0 | 0.750000 | 0 | 1 | 0.0 | 0 | 0 | 0.0 | 0 | 0 | 0.0 | 0 | 0 | 0.250000 | 0 | 1 | 0.0 | 0 | 0 | 0.0 | 0 | 0 | 0.000000 | 0 | 0 |
| 100005 | -20.000000 | -25 | -15 | 11.700000 | 9.0 | 12.0 | 7.200000 | 0.0 | 12.0 | 0.000000 | 0 | 0 | 0.000000 | 0 | 0 | 4.500000 | 0.0 | 9.0 | 0.818182 | 0 | 1 | 0.0 | 0 | 0 | 0.0 | 0 | 0 | 0.0 | 0 | 0 | 0.090909 | 0 | 1 | 0.0 | 0 | 0 | 0.0 | 0 | 0 | 0.090909 | 0 | 1 |
# Flatten the two-level (variable, statistic) column index into single
# names of the form POS_<variable>_<STATISTIC>.
modified_col = [
    f"POS_{variable}_{statistic.upper()}"
    for variable, statistic in pos_agg.columns
]
pos_agg.columns = modified_col
pos_agg.head()
| POS_MONTHS_BALANCE_MEAN | POS_MONTHS_BALANCE_MIN | POS_MONTHS_BALANCE_MAX | POS_CNT_INSTALMENT_MEAN | POS_CNT_INSTALMENT_MIN | POS_CNT_INSTALMENT_MAX | POS_CNT_INSTALMENT_FUTURE_MEAN | POS_CNT_INSTALMENT_FUTURE_MIN | POS_CNT_INSTALMENT_FUTURE_MAX | POS_SK_DPD_MEAN | POS_SK_DPD_MIN | POS_SK_DPD_MAX | POS_SK_DPD_DEF_MEAN | POS_SK_DPD_DEF_MIN | POS_SK_DPD_DEF_MAX | POS_PAID_INSTALMENT_MEAN | POS_PAID_INSTALMENT_MIN | POS_PAID_INSTALMENT_MAX | POS_NAME_CONTRACT_STATUS_Active_MEAN | POS_NAME_CONTRACT_STATUS_Active_MIN | POS_NAME_CONTRACT_STATUS_Active_MAX | POS_NAME_CONTRACT_STATUS_Amortized debt_MEAN | POS_NAME_CONTRACT_STATUS_Amortized debt_MIN | POS_NAME_CONTRACT_STATUS_Amortized debt_MAX | POS_NAME_CONTRACT_STATUS_Approved_MEAN | POS_NAME_CONTRACT_STATUS_Approved_MIN | POS_NAME_CONTRACT_STATUS_Approved_MAX | POS_NAME_CONTRACT_STATUS_Canceled_MEAN | POS_NAME_CONTRACT_STATUS_Canceled_MIN | POS_NAME_CONTRACT_STATUS_Canceled_MAX | POS_NAME_CONTRACT_STATUS_Completed_MEAN | POS_NAME_CONTRACT_STATUS_Completed_MIN | POS_NAME_CONTRACT_STATUS_Completed_MAX | POS_NAME_CONTRACT_STATUS_Demand_MEAN | POS_NAME_CONTRACT_STATUS_Demand_MIN | POS_NAME_CONTRACT_STATUS_Demand_MAX | POS_NAME_CONTRACT_STATUS_Returned to the store_MEAN | POS_NAME_CONTRACT_STATUS_Returned to the store_MIN | POS_NAME_CONTRACT_STATUS_Returned to the store_MAX | POS_NAME_CONTRACT_STATUS_Signed_MEAN | POS_NAME_CONTRACT_STATUS_Signed_MIN | POS_NAME_CONTRACT_STATUS_Signed_MAX | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| SK_ID_CURR | ||||||||||||||||||||||||||||||||||||||||||
| 100001 | -72.555556 | -96 | -53 | 4.000000 | 4.0 | 4.0 | 1.444444 | 0.0 | 4.0 | 0.777778 | 0 | 7 | 0.777778 | 0 | 7 | 2.555556 | 0.0 | 4.0 | 0.777778 | 0 | 1 | 0.0 | 0 | 0 | 0.0 | 0 | 0 | 0.0 | 0 | 0 | 0.222222 | 0 | 1 | 0.0 | 0 | 0 | 0.0 | 0 | 0 | 0.000000 | 0 | 0 |
| 100002 | -10.000000 | -19 | -1 | 24.000000 | 24.0 | 24.0 | 15.000000 | 6.0 | 24.0 | 0.000000 | 0 | 0 | 0.000000 | 0 | 0 | 9.000000 | 0.0 | 18.0 | 1.000000 | 1 | 1 | 0.0 | 0 | 0 | 0.0 | 0 | 0 | 0.0 | 0 | 0 | 0.000000 | 0 | 0 | 0.0 | 0 | 0 | 0.0 | 0 | 0 | 0.000000 | 0 | 0 |
| 100003 | -43.785714 | -77 | -18 | 10.107143 | 6.0 | 12.0 | 5.785714 | 0.0 | 12.0 | 0.000000 | 0 | 0 | 0.000000 | 0 | 0 | 4.321429 | 0.0 | 11.0 | 0.928571 | 0 | 1 | 0.0 | 0 | 0 | 0.0 | 0 | 0 | 0.0 | 0 | 0 | 0.071429 | 0 | 1 | 0.0 | 0 | 0 | 0.0 | 0 | 0 | 0.000000 | 0 | 0 |
| 100004 | -25.500000 | -27 | -24 | 3.750000 | 3.0 | 4.0 | 2.250000 | 0.0 | 4.0 | 0.000000 | 0 | 0 | 0.000000 | 0 | 0 | 1.500000 | 0.0 | 3.0 | 0.750000 | 0 | 1 | 0.0 | 0 | 0 | 0.0 | 0 | 0 | 0.0 | 0 | 0 | 0.250000 | 0 | 1 | 0.0 | 0 | 0 | 0.0 | 0 | 0 | 0.000000 | 0 | 0 |
| 100005 | -20.000000 | -25 | -15 | 11.700000 | 9.0 | 12.0 | 7.200000 | 0.0 | 12.0 | 0.000000 | 0 | 0 | 0.000000 | 0 | 0 | 4.500000 | 0.0 | 9.0 | 0.818182 | 0 | 1 | 0.0 | 0 | 0 | 0.0 | 0 | 0 | 0.0 | 0 | 0 | 0.090909 | 0 | 1 | 0.0 | 0 | 0 | 0.0 | 0 | 0 | 0.090909 | 0 | 1 |
# Per client: mean of the "Completed" status indicator minus mean of the
# "Active" status indicator.
# NOTE(review): 'ACTVIE_COMP' looks like a typo for 'ACTIVE_COMP'; the key is
# kept unchanged here because it is a runtime column name.
pos_agg['ACTVIE_COMP'] = pos_agg['POS_NAME_CONTRACT_STATUS_Completed_MEAN'].sub(
    pos_agg['POS_NAME_CONTRACT_STATUS_Active_MEAN']
)
pos_agg.shape
(337252, 43)
# Attach the TARGET label to the POS aggregates; the inner join keeps only
# the SK_ID_CURR values that exist on both sides.
df_pos = pos_agg.join(app_train['TARGET'], on='SK_ID_CURR', how='inner')
# Tag the frame with a label (appears to be echoed by the reporting helpers
# from Functions -- confirm there).
df_pos.name = 'df_pos'
df_pos.shape
(289444, 44)
# Preview the first rows of the POS aggregates joined with TARGET.
df_pos.head()
| POS_MONTHS_BALANCE_MEAN | POS_MONTHS_BALANCE_MIN | POS_MONTHS_BALANCE_MAX | POS_CNT_INSTALMENT_MEAN | POS_CNT_INSTALMENT_MIN | POS_CNT_INSTALMENT_MAX | POS_CNT_INSTALMENT_FUTURE_MEAN | POS_CNT_INSTALMENT_FUTURE_MIN | POS_CNT_INSTALMENT_FUTURE_MAX | POS_SK_DPD_MEAN | POS_SK_DPD_MIN | POS_SK_DPD_MAX | POS_SK_DPD_DEF_MEAN | POS_SK_DPD_DEF_MIN | POS_SK_DPD_DEF_MAX | POS_PAID_INSTALMENT_MEAN | POS_PAID_INSTALMENT_MIN | POS_PAID_INSTALMENT_MAX | POS_NAME_CONTRACT_STATUS_Active_MEAN | POS_NAME_CONTRACT_STATUS_Active_MIN | POS_NAME_CONTRACT_STATUS_Active_MAX | POS_NAME_CONTRACT_STATUS_Amortized debt_MEAN | POS_NAME_CONTRACT_STATUS_Amortized debt_MIN | POS_NAME_CONTRACT_STATUS_Amortized debt_MAX | POS_NAME_CONTRACT_STATUS_Approved_MEAN | POS_NAME_CONTRACT_STATUS_Approved_MIN | POS_NAME_CONTRACT_STATUS_Approved_MAX | POS_NAME_CONTRACT_STATUS_Canceled_MEAN | POS_NAME_CONTRACT_STATUS_Canceled_MIN | POS_NAME_CONTRACT_STATUS_Canceled_MAX | POS_NAME_CONTRACT_STATUS_Completed_MEAN | POS_NAME_CONTRACT_STATUS_Completed_MIN | POS_NAME_CONTRACT_STATUS_Completed_MAX | POS_NAME_CONTRACT_STATUS_Demand_MEAN | POS_NAME_CONTRACT_STATUS_Demand_MIN | POS_NAME_CONTRACT_STATUS_Demand_MAX | POS_NAME_CONTRACT_STATUS_Returned to the store_MEAN | POS_NAME_CONTRACT_STATUS_Returned to the store_MIN | POS_NAME_CONTRACT_STATUS_Returned to the store_MAX | POS_NAME_CONTRACT_STATUS_Signed_MEAN | POS_NAME_CONTRACT_STATUS_Signed_MIN | POS_NAME_CONTRACT_STATUS_Signed_MAX | ACTVIE_COMP | TARGET | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| SK_ID_CURR | ||||||||||||||||||||||||||||||||||||||||||||
| 100002 | -10.000000 | -19 | -1 | 24.000000 | 24.0 | 24.0 | 15.000000 | 6.0 | 24.0 | 0.0 | 0 | 0 | 0.0 | 0 | 0 | 9.000000 | 0.0 | 18.0 | 1.000000 | 1 | 1 | 0.0 | 0 | 0 | 0.0 | 0 | 0 | 0.0 | 0 | 0 | 0.000000 | 0 | 0 | 0.0 | 0 | 0 | 0.000000 | 0 | 0 | 0.000000 | 0 | 0 | -1.000000 | 1 |
| 100003 | -43.785714 | -77 | -18 | 10.107143 | 6.0 | 12.0 | 5.785714 | 0.0 | 12.0 | 0.0 | 0 | 0 | 0.0 | 0 | 0 | 4.321429 | 0.0 | 11.0 | 0.928571 | 0 | 1 | 0.0 | 0 | 0 | 0.0 | 0 | 0 | 0.0 | 0 | 0 | 0.071429 | 0 | 1 | 0.0 | 0 | 0 | 0.000000 | 0 | 0 | 0.000000 | 0 | 0 | -0.857143 | 0 |
| 100004 | -25.500000 | -27 | -24 | 3.750000 | 3.0 | 4.0 | 2.250000 | 0.0 | 4.0 | 0.0 | 0 | 0 | 0.0 | 0 | 0 | 1.500000 | 0.0 | 3.0 | 0.750000 | 0 | 1 | 0.0 | 0 | 0 | 0.0 | 0 | 0 | 0.0 | 0 | 0 | 0.250000 | 0 | 1 | 0.0 | 0 | 0 | 0.000000 | 0 | 0 | 0.000000 | 0 | 0 | -0.500000 | 0 |
| 100006 | -9.619048 | -20 | -1 | 12.000000 | 1.0 | 48.0 | 8.650000 | 0.0 | 48.0 | 0.0 | 0 | 0 | 0.0 | 0 | 0 | 3.350000 | 0.0 | 9.0 | 0.857143 | 0 | 1 | 0.0 | 0 | 0 | 0.0 | 0 | 0 | 0.0 | 0 | 0 | 0.095238 | 0 | 1 | 0.0 | 0 | 0 | 0.047619 | 0 | 1 | 0.000000 | 0 | 0 | -0.761905 | 0 |
| 100007 | -33.636364 | -77 | -1 | 15.333333 | 10.0 | 24.0 | 8.969697 | 0.0 | 24.0 | 0.0 | 0 | 0 | 0.0 | 0 | 0 | 6.363636 | 0.0 | 17.0 | 0.939394 | 0 | 1 | 0.0 | 0 | 0 | 0.0 | 0 | 0 | 0.0 | 0 | 0 | 0.045455 | 0 | 1 | 0.0 | 0 | 0 | 0.000000 | 0 | 0 | 0.015152 | 0 | 1 | -0.893939 | 0 |
# Project helper from Functions: reports, per column, the correlation with
# TARGET and the count / percentage of missing values.
numerical_summary(df_pos)
Data Frame a 44 colonnes. Dont 44 colonnes contiennent des valeurs manquantes.
| correlations | valeurs_manquantes | %_total | |
|---|---|---|---|
| POS_PAID_INSTALMENT_MAX | -0.02 | 24 | 0.01 |
| POS_NAME_CONTRACT_STATUS_Completed_MAX | -0.02 | 0 | 0.00 |
| POS_PAID_INSTALMENT_MEAN | -0.01 | 24 | 0.01 |
| POS_NAME_CONTRACT_STATUS_Active_MEAN | -0.01 | 0 | 0.00 |
| POS_MONTHS_BALANCE_MAX | -0.00 | 0 | 0.00 |
| POS_NAME_CONTRACT_STATUS_Signed_MAX | -0.00 | 0 | 0.00 |
| POS_NAME_CONTRACT_STATUS_Active_MAX | -0.00 | 0 | 0.00 |
| POS_NAME_CONTRACT_STATUS_Approved_MAX | -0.00 | 0 | 0.00 |
| POS_NAME_CONTRACT_STATUS_Approved_MIN | -0.00 | 0 | 0.00 |
| POS_NAME_CONTRACT_STATUS_Completed_MIN | -0.00 | 0 | 0.00 |
| POS_NAME_CONTRACT_STATUS_Canceled_MEAN | -0.00 | 0 | 0.00 |
| POS_NAME_CONTRACT_STATUS_Completed_MEAN | 0.00 | 0 | 0.00 |
| POS_NAME_CONTRACT_STATUS_Approved_MEAN | 0.00 | 0 | 0.00 |
| POS_NAME_CONTRACT_STATUS_Canceled_MAX | 0.00 | 0 | 0.00 |
| POS_NAME_CONTRACT_STATUS_Returned to the store_MIN | 0.00 | 0 | 0.00 |
| ACTVIE_COMP | 0.00 | 0 | 0.00 |
| POS_SK_DPD_DEF_MIN | 0.00 | 0 | 0.00 |
| POS_SK_DPD_MAX | 0.00 | 0 | 0.00 |
| POS_NAME_CONTRACT_STATUS_Signed_MIN | 0.00 | 0 | 0.00 |
| POS_SK_DPD_MEAN | 0.01 | 0 | 0.00 |
| POS_SK_DPD_MIN | 0.01 | 0 | 0.00 |
| POS_PAID_INSTALMENT_MIN | 0.01 | 24 | 0.01 |
| POS_NAME_CONTRACT_STATUS_Amortized debt_MEAN | 0.01 | 0 | 0.00 |
| POS_SK_DPD_DEF_MEAN | 0.01 | 0 | 0.00 |
| POS_NAME_CONTRACT_STATUS_Demand_MEAN | 0.01 | 0 | 0.00 |
| POS_NAME_CONTRACT_STATUS_Returned to the store_MAX | 0.01 | 0 | 0.00 |
| POS_NAME_CONTRACT_STATUS_Amortized debt_MAX | 0.01 | 0 | 0.00 |
| POS_SK_DPD_DEF_MAX | 0.01 | 0 | 0.00 |
| POS_NAME_CONTRACT_STATUS_Signed_MEAN | 0.01 | 0 | 0.00 |
| POS_NAME_CONTRACT_STATUS_Demand_MAX | 0.01 | 0 | 0.00 |
| POS_NAME_CONTRACT_STATUS_Returned to the store_MEAN | 0.01 | 0 | 0.00 |
| POS_CNT_INSTALMENT_MAX | 0.01 | 24 | 0.01 |
| POS_CNT_INSTALMENT_FUTURE_MAX | 0.01 | 24 | 0.01 |
| POS_NAME_CONTRACT_STATUS_Active_MIN | 0.02 | 0 | 0.00 |
| POS_CNT_INSTALMENT_MEAN | 0.02 | 24 | 0.01 |
| POS_CNT_INSTALMENT_FUTURE_MIN | 0.02 | 24 | 0.01 |
| POS_CNT_INSTALMENT_MIN | 0.02 | 24 | 0.01 |
| POS_CNT_INSTALMENT_FUTURE_MEAN | 0.03 | 24 | 0.01 |
| POS_MONTHS_BALANCE_MEAN | 0.03 | 0 | 0.00 |
| POS_MONTHS_BALANCE_MIN | 0.06 | 0 | 0.00 |
| TARGET | 1.00 | 0 | 0.00 |
| POS_NAME_CONTRACT_STATUS_Amortized debt_MIN | NaN | 0 | 0.00 |
| POS_NAME_CONTRACT_STATUS_Canceled_MIN | NaN | 0 | 0.00 |
| POS_NAME_CONTRACT_STATUS_Demand_MIN | NaN | 0 | 0.00 |
# Keep only the POS features retained by select_numerical, then drop the label.
# NOTE(review): presumably `correlation` is a minimum correlation with TARGET
# and `missing` a maximum missing-value percentage -- confirm in Functions.
pos = select_numerical(df_pos, missing=70, correlation=0.03).drop(columns='TARGET')
pos
Data Frame a 44 colonnes. Dont 44 colonnes contiennent des valeurs manquantes.
'>>>> Most correlated with TARGET from df_pos'
| correlations | valeurs_manquantes | %_total | |
|---|---|---|---|
| POS_MONTHS_BALANCE_MIN | 0.06 | 0 | 0.0 |
| TARGET | 1.00 | 0 | 0.0 |
| POS_MONTHS_BALANCE_MIN | |
|---|---|
| SK_ID_CURR | |
| 100002 | -19 |
| 100003 | -77 |
| 100004 | -27 |
| 100006 | -20 |
| 100007 | -77 |
| ... | ... |
| 456251 | -9 |
| 456252 | -82 |
| 456253 | -96 |
| 456254 | -11 |
| 456255 | -33 |
289444 rows × 1 columns
import gc

# Project helper from Functions; presumably plots each selected POS feature
# against TARGET -- confirm in Functions.
frame_vs_target(
    pos,
    app_train.TARGET,
    'Variable aggregated on SK_ID_CURR from pos_cash_balance.csv ',
)
# Drop the large intermediate frames and trigger a collection to free memory.
del pos_data, pos_agg, df_pos
gc.collect()
2625
# Load credit_card_balance.csv and preview its first rows.
credit_data = pd.read_csv('../donnees/credit_card_balance.csv')
credit_data.head()
| SK_ID_PREV | SK_ID_CURR | MONTHS_BALANCE | AMT_BALANCE | AMT_CREDIT_LIMIT_ACTUAL | AMT_DRAWINGS_ATM_CURRENT | AMT_DRAWINGS_CURRENT | AMT_DRAWINGS_OTHER_CURRENT | AMT_DRAWINGS_POS_CURRENT | AMT_INST_MIN_REGULARITY | AMT_PAYMENT_CURRENT | AMT_PAYMENT_TOTAL_CURRENT | AMT_RECEIVABLE_PRINCIPAL | AMT_RECIVABLE | AMT_TOTAL_RECEIVABLE | CNT_DRAWINGS_ATM_CURRENT | CNT_DRAWINGS_CURRENT | CNT_DRAWINGS_OTHER_CURRENT | CNT_DRAWINGS_POS_CURRENT | CNT_INSTALMENT_MATURE_CUM | NAME_CONTRACT_STATUS | SK_DPD | SK_DPD_DEF | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2562384 | 378907 | -6 | 56.970 | 135000 | 0.0 | 877.5 | 0.0 | 877.5 | 1700.325 | 1800.0 | 1800.0 | 0.000 | 0.000 | 0.000 | 0.0 | 1 | 0.0 | 1.0 | 35.0 | Active | 0 | 0 |
| 1 | 2582071 | 363914 | -1 | 63975.555 | 45000 | 2250.0 | 2250.0 | 0.0 | 0.0 | 2250.000 | 2250.0 | 2250.0 | 60175.080 | 64875.555 | 64875.555 | 1.0 | 1 | 0.0 | 0.0 | 69.0 | Active | 0 | 0 |
| 2 | 1740877 | 371185 | -7 | 31815.225 | 450000 | 0.0 | 0.0 | 0.0 | 0.0 | 2250.000 | 2250.0 | 2250.0 | 26926.425 | 31460.085 | 31460.085 | 0.0 | 0 | 0.0 | 0.0 | 30.0 | Active | 0 | 0 |
| 3 | 1389973 | 337855 | -4 | 236572.110 | 225000 | 2250.0 | 2250.0 | 0.0 | 0.0 | 11795.760 | 11925.0 | 11925.0 | 224949.285 | 233048.970 | 233048.970 | 1.0 | 1 | 0.0 | 0.0 | 10.0 | Active | 0 | 0 |
| 4 | 1891521 | 126868 | -1 | 453919.455 | 450000 | 0.0 | 11547.0 | 0.0 | 11547.0 | 22924.890 | 27000.0 | 27000.0 | 443044.395 | 453919.455 | 453919.455 | 0.0 | 1 | 0.0 | 1.0 | 101.0 | Active | 0 | 0 |
# Number of rows per contract status in the credit card data.
credit_data['NAME_CONTRACT_STATUS'].value_counts()
Active 3698436 Completed 128918 Signed 11058 Demand 1365 Sent proposal 513 Refused 17 Approved 5 Name: NAME_CONTRACT_STATUS, dtype: int64
# 1 on rows where the current drawings exceed 30% of the actual credit limit,
# 0 otherwise.
credit_data['FLAG_GRT_30'] = (
    credit_data['AMT_DRAWINGS_CURRENT']
    .gt(0.30 * credit_data['AMT_CREDIT_LIMIT_ACTUAL'])
    .astype(int)
)
# Project helper from Functions: returns the one-hot encoded frame plus the
# encoded column names; the third return value is not used here.
credit_data, credit_data_cat_columns, _ = one_hot_encoding_dataframe(credit_data)
credit_data.describe()
| SK_ID_PREV | SK_ID_CURR | MONTHS_BALANCE | AMT_BALANCE | AMT_CREDIT_LIMIT_ACTUAL | AMT_DRAWINGS_ATM_CURRENT | AMT_DRAWINGS_CURRENT | AMT_DRAWINGS_OTHER_CURRENT | AMT_DRAWINGS_POS_CURRENT | AMT_INST_MIN_REGULARITY | AMT_PAYMENT_CURRENT | AMT_PAYMENT_TOTAL_CURRENT | AMT_RECEIVABLE_PRINCIPAL | AMT_RECIVABLE | AMT_TOTAL_RECEIVABLE | CNT_DRAWINGS_ATM_CURRENT | CNT_DRAWINGS_CURRENT | CNT_DRAWINGS_OTHER_CURRENT | CNT_DRAWINGS_POS_CURRENT | CNT_INSTALMENT_MATURE_CUM | SK_DPD | SK_DPD_DEF | FLAG_GRT_30 | NAME_CONTRACT_STATUS_Active | NAME_CONTRACT_STATUS_Approved | NAME_CONTRACT_STATUS_Completed | NAME_CONTRACT_STATUS_Demand | NAME_CONTRACT_STATUS_Refused | NAME_CONTRACT_STATUS_Sent proposal | NAME_CONTRACT_STATUS_Signed | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 3.840312e+06 | 3.840312e+06 | 3.840312e+06 | 3.840312e+06 | 3.840312e+06 | 3.090496e+06 | 3.840312e+06 | 3.090496e+06 | 3.090496e+06 | 3.535076e+06 | 3.072324e+06 | 3.840312e+06 | 3.840312e+06 | 3.840312e+06 | 3.840312e+06 | 3.090496e+06 | 3.840312e+06 | 3.090496e+06 | 3.090496e+06 | 3.535076e+06 | 3.840312e+06 | 3.840312e+06 | 3.840312e+06 | 3.840312e+06 | 3.840312e+06 | 3.840312e+06 | 3.840312e+06 | 3.840312e+06 | 3.840312e+06 | 3.840312e+06 |
| mean | 1.904504e+06 | 2.783242e+05 | -3.452192e+01 | 5.830016e+04 | 1.538080e+05 | 5.961325e+03 | 7.433388e+03 | 2.881696e+02 | 2.968805e+03 | 3.540204e+03 | 1.028054e+04 | 7.588857e+03 | 5.596588e+04 | 5.808881e+04 | 5.809829e+04 | 3.094490e-01 | 7.031439e-01 | 4.812496e-03 | 5.594791e-01 | 2.082508e+01 | 9.283667e+00 | 3.316220e-01 | 3.801280e-02 | 9.630561e-01 | 1.301978e-06 | 3.356967e-02 | 3.554399e-04 | 4.426724e-06 | 1.335829e-04 | 2.879454e-03 |
| std | 5.364695e+05 | 1.027045e+05 | 2.666775e+01 | 1.063070e+05 | 1.651457e+05 | 2.822569e+04 | 3.384608e+04 | 8.201989e+03 | 2.079689e+04 | 5.600154e+03 | 3.607808e+04 | 3.200599e+04 | 1.025336e+05 | 1.059654e+05 | 1.059718e+05 | 1.100401e+00 | 3.190347e+00 | 8.263861e-02 | 3.240649e+00 | 2.005149e+01 | 9.751570e+01 | 2.147923e+01 | 1.912272e-01 | 1.886241e-01 | 1.141042e-03 | 1.801187e-01 | 1.884976e-02 | 2.103974e-03 | 1.155704e-02 | 5.358323e-02 |
| min | 1.000018e+06 | 1.000060e+05 | -9.600000e+01 | -4.202502e+05 | 0.000000e+00 | -6.827310e+03 | -6.211620e+03 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | -4.233058e+05 | -4.202502e+05 | -4.202502e+05 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 |
| 25% | 1.434385e+06 | 1.895170e+05 | -5.500000e+01 | 0.000000e+00 | 4.500000e+04 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 1.523700e+02 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 4.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 1.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 |
| 50% | 1.897122e+06 | 2.783960e+05 | -2.800000e+01 | 0.000000e+00 | 1.125000e+05 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 2.702700e+03 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 1.500000e+01 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 1.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 |
| 75% | 2.369328e+06 | 3.675800e+05 | -1.100000e+01 | 8.904669e+04 | 1.800000e+05 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 6.633911e+03 | 9.000000e+03 | 6.750000e+03 | 8.535924e+04 | 8.889949e+04 | 8.891451e+04 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 3.200000e+01 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 1.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 |
| max | 2.843496e+06 | 4.562500e+05 | -1.000000e+00 | 1.505902e+06 | 1.350000e+06 | 2.115000e+06 | 2.287098e+06 | 1.529847e+06 | 2.239274e+06 | 2.028820e+05 | 4.289207e+06 | 4.278316e+06 | 1.472317e+06 | 1.493338e+06 | 1.493338e+06 | 5.100000e+01 | 1.650000e+02 | 1.200000e+01 | 1.650000e+02 | 1.200000e+02 | 3.260000e+03 | 3.260000e+03 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 |
# Aggregation spec: every non-ID column is averaged per client, except the
# FLAG_GRT_* indicators, which are summed.
credit_data_agg = {
    col: ['sum']
    if col in ('FLAG_GRT_30', 'FLAG_GRT_40', 'FLAG_GRT_50', 'FLAG_GRT_60', 'FLAG_GRT_100')
    else ['mean']
    for col in credit_data.columns
    if col not in ('SK_ID_CURR', 'SK_ID_PREV')
}
# Collapse the rows to one line per SK_ID_CURR using that spec.
credit_agg = credit_data.groupby('SK_ID_CURR').agg(credit_data_agg)
credit_agg.head()
| MONTHS_BALANCE | AMT_BALANCE | AMT_CREDIT_LIMIT_ACTUAL | AMT_DRAWINGS_ATM_CURRENT | AMT_DRAWINGS_CURRENT | AMT_DRAWINGS_OTHER_CURRENT | AMT_DRAWINGS_POS_CURRENT | AMT_INST_MIN_REGULARITY | AMT_PAYMENT_CURRENT | AMT_PAYMENT_TOTAL_CURRENT | AMT_RECEIVABLE_PRINCIPAL | AMT_RECIVABLE | AMT_TOTAL_RECEIVABLE | CNT_DRAWINGS_ATM_CURRENT | CNT_DRAWINGS_CURRENT | CNT_DRAWINGS_OTHER_CURRENT | CNT_DRAWINGS_POS_CURRENT | CNT_INSTALMENT_MATURE_CUM | SK_DPD | SK_DPD_DEF | FLAG_GRT_30 | NAME_CONTRACT_STATUS_Active | NAME_CONTRACT_STATUS_Approved | NAME_CONTRACT_STATUS_Completed | NAME_CONTRACT_STATUS_Demand | NAME_CONTRACT_STATUS_Refused | NAME_CONTRACT_STATUS_Sent proposal | NAME_CONTRACT_STATUS_Signed | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | mean | sum | mean | mean | mean | mean | mean | mean | mean | |
| SK_ID_CURR | ||||||||||||||||||||||||||||
| 100006 | -3.5 | 0.000000 | 270000.000000 | NaN | 0.000000 | NaN | NaN | 0.000000 | NaN | 0.000000 | 0.000000 | 0.000000 | 0.000000 | NaN | 0.000000 | NaN | NaN | 0.000000 | 0.000000 | 0.000000 | 0 | 1.000000 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 |
| 100011 | -38.5 | 54482.111149 | 164189.189189 | 2432.432432 | 2432.432432 | 0.0 | 0.0 | 3956.221849 | 4843.064189 | 4520.067568 | 52402.088919 | 54433.179122 | 54433.179122 | 0.054054 | 0.054054 | 0.0 | 0.0 | 25.767123 | 0.000000 | 0.000000 | 1 | 1.000000 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 |
| 100013 | -48.5 | 18159.919219 | 131718.750000 | 6350.000000 | 5953.125000 | 0.0 | 0.0 | 1454.539551 | 7168.346250 | 6817.172344 | 17255.559844 | 18101.079844 | 18101.079844 | 0.255556 | 0.239583 | 0.0 | 0.0 | 18.719101 | 0.010417 | 0.010417 | 4 | 1.000000 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 |
| 100021 | -10.0 | 0.000000 | 675000.000000 | NaN | 0.000000 | NaN | NaN | 0.000000 | NaN | 0.000000 | 0.000000 | 0.000000 | 0.000000 | NaN | 0.000000 | NaN | NaN | 0.000000 | 0.000000 | 0.000000 | 0 | 0.411765 | 0.0 | 0.588235 | 0.0 | 0.0 | 0.0 | 0.0 |
| 100023 | -7.5 | 0.000000 | 135000.000000 | NaN | 0.000000 | NaN | NaN | 0.000000 | NaN | 0.000000 | 0.000000 | 0.000000 | 0.000000 | NaN | 0.000000 | NaN | NaN | 0.000000 | 0.000000 | 0.000000 | 0 | 1.000000 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 |
# Flatten the two-level (variable, statistic) column index into single
# names of the form CRED_<variable>_<STATISTIC>.
modified_col = [
    f"CRED_{variable}_{statistic.upper()}"
    for variable, statistic in credit_agg.columns
]
credit_agg.columns = modified_col
credit_agg.head()
| CRED_MONTHS_BALANCE_MEAN | CRED_AMT_BALANCE_MEAN | CRED_AMT_CREDIT_LIMIT_ACTUAL_MEAN | CRED_AMT_DRAWINGS_ATM_CURRENT_MEAN | CRED_AMT_DRAWINGS_CURRENT_MEAN | CRED_AMT_DRAWINGS_OTHER_CURRENT_MEAN | CRED_AMT_DRAWINGS_POS_CURRENT_MEAN | CRED_AMT_INST_MIN_REGULARITY_MEAN | CRED_AMT_PAYMENT_CURRENT_MEAN | CRED_AMT_PAYMENT_TOTAL_CURRENT_MEAN | CRED_AMT_RECEIVABLE_PRINCIPAL_MEAN | CRED_AMT_RECIVABLE_MEAN | CRED_AMT_TOTAL_RECEIVABLE_MEAN | CRED_CNT_DRAWINGS_ATM_CURRENT_MEAN | CRED_CNT_DRAWINGS_CURRENT_MEAN | CRED_CNT_DRAWINGS_OTHER_CURRENT_MEAN | CRED_CNT_DRAWINGS_POS_CURRENT_MEAN | CRED_CNT_INSTALMENT_MATURE_CUM_MEAN | CRED_SK_DPD_MEAN | CRED_SK_DPD_DEF_MEAN | CRED_FLAG_GRT_30_SUM | CRED_NAME_CONTRACT_STATUS_Active_MEAN | CRED_NAME_CONTRACT_STATUS_Approved_MEAN | CRED_NAME_CONTRACT_STATUS_Completed_MEAN | CRED_NAME_CONTRACT_STATUS_Demand_MEAN | CRED_NAME_CONTRACT_STATUS_Refused_MEAN | CRED_NAME_CONTRACT_STATUS_Sent proposal_MEAN | CRED_NAME_CONTRACT_STATUS_Signed_MEAN | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| SK_ID_CURR | ||||||||||||||||||||||||||||
| 100006 | -3.5 | 0.000000 | 270000.000000 | NaN | 0.000000 | NaN | NaN | 0.000000 | NaN | 0.000000 | 0.000000 | 0.000000 | 0.000000 | NaN | 0.000000 | NaN | NaN | 0.000000 | 0.000000 | 0.000000 | 0 | 1.000000 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 |
| 100011 | -38.5 | 54482.111149 | 164189.189189 | 2432.432432 | 2432.432432 | 0.0 | 0.0 | 3956.221849 | 4843.064189 | 4520.067568 | 52402.088919 | 54433.179122 | 54433.179122 | 0.054054 | 0.054054 | 0.0 | 0.0 | 25.767123 | 0.000000 | 0.000000 | 1 | 1.000000 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 |
| 100013 | -48.5 | 18159.919219 | 131718.750000 | 6350.000000 | 5953.125000 | 0.0 | 0.0 | 1454.539551 | 7168.346250 | 6817.172344 | 17255.559844 | 18101.079844 | 18101.079844 | 0.255556 | 0.239583 | 0.0 | 0.0 | 18.719101 | 0.010417 | 0.010417 | 4 | 1.000000 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 |
| 100021 | -10.0 | 0.000000 | 675000.000000 | NaN | 0.000000 | NaN | NaN | 0.000000 | NaN | 0.000000 | 0.000000 | 0.000000 | 0.000000 | NaN | 0.000000 | NaN | NaN | 0.000000 | 0.000000 | 0.000000 | 0 | 0.411765 | 0.0 | 0.588235 | 0.0 | 0.0 | 0.0 | 0.0 |
| 100023 | -7.5 | 0.000000 | 135000.000000 | NaN | 0.000000 | NaN | NaN | 0.000000 | NaN | 0.000000 | 0.000000 | 0.000000 | 0.000000 | NaN | 0.000000 | NaN | NaN | 0.000000 | 0.000000 | 0.000000 | 0 | 1.000000 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 |
# Highest credit utilisation (balance / credit limit) per client, computed
# only on rows whose MONTHS_BALANCE is >= `month`.
month = -3
cred_temp = credit_data[credit_data.MONTHS_BALANCE >= month].copy()
# AMT_CREDIT_LIMIT_ACTUAL can be 0; dividing by it would yield +/-inf, which
# would then win the per-client max() and not be counted as a missing value.
# Treat a zero limit as missing so the ratio is NaN on those rows instead.
credit_limit = cred_temp['AMT_CREDIT_LIMIT_ACTUAL'].replace(0, np.nan)
cred_temp['CRED_UTIL'] = cred_temp['AMT_BALANCE'] / credit_limit
# Assignment aligns on the SK_ID_CURR index; clients without any row in the
# selected window get NaN.
credit_agg['CREDIT_UTIL_MAX'] = cred_temp.groupby('SK_ID_CURR')['CRED_UTIL'].max()
# Attach the TARGET label; the inner join keeps only the SK_ID_CURR values
# that exist on both sides.
df_credit = credit_agg.join(app_train.TARGET, how='inner', on='SK_ID_CURR')
# Tag the frame with a label (appears to be echoed by the reporting helpers
# from Functions -- confirm there).
df_credit.name = 'df_credit'
numerical_summary(df_credit)
Data Frame a 30 colonnes. Dont 30 colonnes contiennent des valeurs manquantes.
| correlations | valeurs_manquantes | %_total | |
|---|---|---|---|
| CRED_CNT_INSTALMENT_MATURE_CUM_MEAN | -0.03 | 0 | 0.00 |
| CRED_NAME_CONTRACT_STATUS_Completed_MEAN | -0.02 | 0 | 0.00 |
| CRED_NAME_CONTRACT_STATUS_Sent proposal_MEAN | -0.01 | 0 | 0.00 |
| CRED_AMT_CREDIT_LIMIT_ACTUAL_MEAN | -0.01 | 0 | 0.00 |
| CRED_AMT_DRAWINGS_POS_CURRENT_MEAN | -0.00 | 25765 | 29.65 |
| CRED_SK_DPD_MEAN | -0.00 | 0 | 0.00 |
| CRED_NAME_CONTRACT_STATUS_Approved_MEAN | -0.00 | 0 | 0.00 |
| CRED_NAME_CONTRACT_STATUS_Signed_MEAN | 0.00 | 0 | 0.00 |
| CRED_NAME_CONTRACT_STATUS_Refused_MEAN | 0.00 | 0 | 0.00 |
| CRED_NAME_CONTRACT_STATUS_Demand_MEAN | 0.00 | 0 | 0.00 |
| CRED_AMT_PAYMENT_CURRENT_MEAN | 0.01 | 25845 | 29.74 |
| CRED_SK_DPD_DEF_MEAN | 0.01 | 0 | 0.00 |
| CRED_AMT_DRAWINGS_OTHER_CURRENT_MEAN | 0.01 | 25765 | 29.65 |
| CRED_CNT_DRAWINGS_OTHER_CURRENT_MEAN | 0.01 | 25765 | 29.65 |
| CRED_NAME_CONTRACT_STATUS_Active_MEAN | 0.02 | 0 | 0.00 |
| CRED_AMT_PAYMENT_TOTAL_CURRENT_MEAN | 0.02 | 0 | 0.00 |
| CRED_FLAG_GRT_30_SUM | 0.03 | 0 | 0.00 |
| CRED_CNT_DRAWINGS_POS_CURRENT_MEAN | 0.05 | 25765 | 29.65 |
| CRED_AMT_DRAWINGS_CURRENT_MEAN | 0.06 | 0 | 0.00 |
| CRED_AMT_DRAWINGS_ATM_CURRENT_MEAN | 0.06 | 25765 | 29.65 |
| CRED_MONTHS_BALANCE_MEAN | 0.06 | 0 | 0.00 |
| CRED_AMT_INST_MIN_REGULARITY_MEAN | 0.07 | 0 | 0.00 |
| CRED_CNT_DRAWINGS_CURRENT_MEAN | 0.08 | 0 | 0.00 |
| CRED_AMT_RECEIVABLE_PRINCIPAL_MEAN | 0.09 | 0 | 0.00 |
| CRED_AMT_RECIVABLE_MEAN | 0.09 | 0 | 0.00 |
| CRED_AMT_TOTAL_RECEIVABLE_MEAN | 0.09 | 0 | 0.00 |
| CRED_AMT_BALANCE_MEAN | 0.09 | 0 | 0.00 |
| CRED_CNT_DRAWINGS_ATM_CURRENT_MEAN | 0.11 | 25765 | 29.65 |
| CREDIT_UTIL_MAX | 0.16 | 21247 | 24.45 |
| TARGET | 1.00 | 0 | 0.00 |
# Keep only the credit-card features retained by select_numerical, then drop
# the label.
# NOTE(review): presumably `correlation` is a minimum correlation with TARGET
# and `missing` a maximum missing-value percentage -- confirm in Functions.
credit = select_numerical(df_credit, missing=70, correlation=0.03).drop(columns='TARGET')
credit
Data Frame a 30 colonnes. Dont 30 colonnes contiennent des valeurs manquantes.
'>>>> Most correlated with TARGET from df_credit'
| correlations | valeurs_manquantes | %_total | |
|---|---|---|---|
| CRED_CNT_DRAWINGS_POS_CURRENT_MEAN | 0.05 | 25765 | 29.65 |
| CRED_AMT_DRAWINGS_CURRENT_MEAN | 0.06 | 0 | 0.00 |
| CRED_AMT_DRAWINGS_ATM_CURRENT_MEAN | 0.06 | 25765 | 29.65 |
| CRED_MONTHS_BALANCE_MEAN | 0.06 | 0 | 0.00 |
| CRED_AMT_INST_MIN_REGULARITY_MEAN | 0.07 | 0 | 0.00 |
| CRED_CNT_DRAWINGS_CURRENT_MEAN | 0.08 | 0 | 0.00 |
| CRED_AMT_RECEIVABLE_PRINCIPAL_MEAN | 0.09 | 0 | 0.00 |
| CRED_AMT_RECIVABLE_MEAN | 0.09 | 0 | 0.00 |
| CRED_AMT_TOTAL_RECEIVABLE_MEAN | 0.09 | 0 | 0.00 |
| CRED_AMT_BALANCE_MEAN | 0.09 | 0 | 0.00 |
| CRED_CNT_DRAWINGS_ATM_CURRENT_MEAN | 0.11 | 25765 | 29.65 |
| CREDIT_UTIL_MAX | 0.16 | 21247 | 24.45 |
| TARGET | 1.00 | 0 | 0.00 |
| CRED_CNT_DRAWINGS_POS_CURRENT_MEAN | CRED_AMT_DRAWINGS_CURRENT_MEAN | CRED_AMT_DRAWINGS_ATM_CURRENT_MEAN | CRED_MONTHS_BALANCE_MEAN | CRED_AMT_INST_MIN_REGULARITY_MEAN | CRED_CNT_DRAWINGS_CURRENT_MEAN | CRED_AMT_RECEIVABLE_PRINCIPAL_MEAN | CRED_AMT_RECIVABLE_MEAN | CRED_AMT_TOTAL_RECEIVABLE_MEAN | CRED_AMT_BALANCE_MEAN | CRED_CNT_DRAWINGS_ATM_CURRENT_MEAN | CREDIT_UTIL_MAX | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| SK_ID_CURR | ||||||||||||
| 100006 | NaN | 0.000000 | NaN | -3.5 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | NaN | 0.000000 |
| 100011 | 0.000000 | 2432.432432 | 2432.432432 | -38.5 | 3956.221849 | 0.054054 | 52402.088919 | 54433.179122 | 54433.179122 | 54482.111149 | 0.054054 | 0.000000 |
| 100021 | NaN | 0.000000 | NaN | -10.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | NaN | 0.000000 |
| 100023 | NaN | 0.000000 | NaN | -7.5 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | NaN | NaN |
| 100036 | NaN | 0.000000 | NaN | -7.5 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | NaN | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 456242 | 2.875000 | 39849.783750 | 23062.500000 | -4.5 | 6980.152500 | 3.875000 | 144650.565000 | 147757.578750 | 147757.578750 | 148232.328750 | 1.000000 | 0.965666 |
| 456244 | 0.317073 | 26842.388049 | 24475.609756 | -21.0 | 6514.200000 | 1.365854 | 127608.373537 | 130767.060732 | 130767.060732 | 131834.730732 | 1.048780 | NaN |
| 456246 | 2.500000 | 15199.256250 | 0.000000 | -5.5 | 1439.150625 | 2.500000 | 12883.016250 | 12897.894375 | 12897.894375 | 13136.731875 | 0.000000 | 0.000464 |
| 456247 | 0.031579 | 2149.506474 | 2136.315789 | -49.0 | 1414.704789 | 0.147368 | 22100.653895 | 23128.243105 | 23128.243105 | 23216.396211 | 0.115789 | NaN |
| 456248 | NaN | 0.000000 | NaN | -13.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | NaN | 0.000000 |
86905 rows × 12 columns
frame_vs_target(credit,app_train.TARGET,'Variables aggregated on SK_ID_CURR from credit_card_balance.csv ')
# Drop the intermediate credit-card frames and run a collection pass before
# the next table is loaded.
import gc

del credit_data, credit_agg, df_credit
gc.collect()
29098
# Load the raw instalment-payment history and preview its first rows.
installments_payments_data = pd.read_csv('../donnees/installments_payments.csv')
installments_payments_data.head()
| SK_ID_PREV | SK_ID_CURR | NUM_INSTALMENT_VERSION | NUM_INSTALMENT_NUMBER | DAYS_INSTALMENT | DAYS_ENTRY_PAYMENT | AMT_INSTALMENT | AMT_PAYMENT | |
|---|---|---|---|---|---|---|---|---|
| 0 | 1054186 | 161674 | 1.0 | 6 | -1180.0 | -1187.0 | 6948.360 | 6948.360 |
| 1 | 1330831 | 151639 | 0.0 | 34 | -2156.0 | -2156.0 | 1716.525 | 1716.525 |
| 2 | 2085231 | 193053 | 2.0 | 1 | -63.0 | -63.0 | 25425.000 | 25425.000 |
| 3 | 2452527 | 199697 | 1.0 | 3 | -2418.0 | -2426.0 | 24350.130 | 24350.130 |
| 4 | 2714724 | 167756 | 1.0 | 2 | -1383.0 | -1366.0 | 2165.040 | 2160.585 |
# Frequency of each NUM_INSTALMENT_VERSION value across all rows.
installments_payments_data['NUM_INSTALMENT_VERSION'].value_counts()
1.0 8485004 0.0 4082498 2.0 620283 3.0 237063 4.0 55274 5.0 48404 6.0 17092 7.0 16771 9.0 8359 8.0 7814 10.0 4637 11.0 4342 13.0 2951 12.0 2863 15.0 1917 14.0 1906 16.0 1283 17.0 1249 18.0 883 19.0 816 20.0 615 21.0 589 22.0 426 23.0 373 24.0 291 25.0 268 26.0 211 27.0 177 29.0 163 28.0 145 30.0 107 31.0 101 32.0 72 33.0 63 34.0 62 35.0 46 39.0 42 36.0 36 37.0 33 38.0 29 40.0 19 43.0 19 41.0 18 42.0 18 44.0 13 45.0 8 61.0 8 72.0 7 46.0 5 47.0 4 48.0 3 50.0 3 52.0 3 49.0 3 51.0 2 178.0 1 57.0 1 68.0 1 55.0 1 58.0 1 59.0 1 53.0 1 56.0 1 54.0 1 73.0 1 Name: NUM_INSTALMENT_VERSION, dtype: int64
# Row-level payment-behaviour flags (1 = event occurred, 0 = otherwise).
#
# DAYS_INSTALMENT (due day) and DAYS_ENTRY_PAYMENT (actual payment day) are
# day offsets relative to the application date, per the dataset's column
# description. A payment is therefore late when it was entered on a LATER
# day than the due day, i.e. DAYS_ENTRY_PAYMENT > DAYS_INSTALMENT.
# Testing DAYS_INSTALMENT - DAYS_ENTRY_PAYMENT > 0 would flag early payments.
# Rows with a missing payment day compare False and get 0.
installments_payments_data['LATE_PAYMENT'] = (
    installments_payments_data['DAYS_ENTRY_PAYMENT']
    > installments_payments_data['DAYS_INSTALMENT']
).astype(int)
# Under-payment: the amount actually paid is below the amount that was due.
installments_payments_data['LESS_PAYMENT'] = (
    installments_payments_data['AMT_PAYMENT']
    < installments_payments_data['AMT_INSTALMENT']
).astype(int)
installments_payments_data.head()
| SK_ID_PREV | SK_ID_CURR | NUM_INSTALMENT_VERSION | NUM_INSTALMENT_NUMBER | DAYS_INSTALMENT | DAYS_ENTRY_PAYMENT | AMT_INSTALMENT | AMT_PAYMENT | LATE_PAYMENT | LESS_PAYMENT | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1054186 | 161674 | 1.0 | 6 | -1180.0 | -1187.0 | 6948.360 | 6948.360 | 1 | 0 |
| 1 | 1330831 | 151639 | 0.0 | 34 | -2156.0 | -2156.0 | 1716.525 | 1716.525 | 0 | 0 |
| 2 | 2085231 | 193053 | 2.0 | 1 | -63.0 | -63.0 | 25425.000 | 25425.000 | 0 | 0 |
| 3 | 2452527 | 199697 | 1.0 | 3 | -2418.0 | -2426.0 | 24350.130 | 24350.130 | 1 | 0 |
| 4 | 2714724 | 167756 | 1.0 | 2 | -1383.0 | -1366.0 | 2165.040 | 2160.585 | 0 | 1 |
# Replace the value 365243 with NaN in every DAYS_* column.
# The result is assigned back to the frame on purpose: calling
# `df[col].replace(..., inplace=True)` mutates a temporary Series, which is a
# silent no-op when pandas copy-on-write is active.
for col in installments_payments_data.columns:
    if col.startswith('DAYS'):
        installments_payments_data[col] = installments_payments_data[col].replace(365243, np.nan)
# Encode with the project helper one_hot_encoding_dataframe; its third return
# value is not used here.
installments_payments_data, installments_payments_cat_columns, _ = one_hot_encoding_dataframe(installments_payments_data)
installments_payments_data.describe()
| SK_ID_PREV | SK_ID_CURR | NUM_INSTALMENT_VERSION | NUM_INSTALMENT_NUMBER | DAYS_INSTALMENT | DAYS_ENTRY_PAYMENT | AMT_INSTALMENT | AMT_PAYMENT | LATE_PAYMENT | LESS_PAYMENT | |
|---|---|---|---|---|---|---|---|---|---|---|
| count | 1.360540e+07 | 1.360540e+07 | 1.360540e+07 | 1.360540e+07 | 1.360540e+07 | 1.360250e+07 | 1.360540e+07 | 1.360250e+07 | 1.360540e+07 | 1.360540e+07 |
| mean | 1.903365e+06 | 2.784449e+05 | 8.566373e-01 | 1.887090e+01 | -1.042270e+03 | -1.051114e+03 | 1.705091e+04 | 1.723822e+04 | 6.842486e-01 | 9.521902e-02 |
| std | 5.362029e+05 | 1.027183e+05 | 1.035216e+00 | 2.666407e+01 | 8.009463e+02 | 8.005859e+02 | 5.057025e+04 | 5.473578e+04 | 4.648144e-01 | 2.935172e-01 |
| min | 1.000001e+06 | 1.000010e+05 | 0.000000e+00 | 1.000000e+00 | -2.922000e+03 | -4.921000e+03 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 |
| 25% | 1.434191e+06 | 1.896390e+05 | 0.000000e+00 | 4.000000e+00 | -1.654000e+03 | -1.662000e+03 | 4.226085e+03 | 3.398265e+03 | 0.000000e+00 | 0.000000e+00 |
| 50% | 1.896520e+06 | 2.786850e+05 | 1.000000e+00 | 8.000000e+00 | -8.180000e+02 | -8.270000e+02 | 8.884080e+03 | 8.125515e+03 | 1.000000e+00 | 0.000000e+00 |
| 75% | 2.369094e+06 | 3.675300e+05 | 1.000000e+00 | 1.900000e+01 | -3.610000e+02 | -3.700000e+02 | 1.671021e+04 | 1.610842e+04 | 1.000000e+00 | 0.000000e+00 |
| max | 2.843499e+06 | 4.562550e+05 | 1.780000e+02 | 2.770000e+02 | -1.000000e+00 | -1.000000e+00 | 3.771488e+06 | 3.771488e+06 | 1.000000e+00 | 1.000000e+00 |
# Per-client aggregation spec: the two payment flags are summed, every other
# column except the two identifiers is averaged.
installments_payments_data_agg = {
    col: ['sum'] if col in ('LATE_PAYMENT', 'LESS_PAYMENT') else ['mean']
    for col in installments_payments_data.columns
    if col not in ('SK_ID_CURR', 'SK_ID_PREV')
}
installments_payments_agg = installments_payments_data.groupby('SK_ID_CURR').agg(installments_payments_data_agg)
installments_payments_agg.head()
| NUM_INSTALMENT_VERSION | NUM_INSTALMENT_NUMBER | DAYS_INSTALMENT | DAYS_ENTRY_PAYMENT | AMT_INSTALMENT | AMT_PAYMENT | LATE_PAYMENT | LESS_PAYMENT | |
|---|---|---|---|---|---|---|---|---|
| mean | mean | mean | mean | mean | mean | sum | sum | |
| SK_ID_CURR | ||||||||
| 100001 | 1.142857 | 2.714286 | -2187.714286 | -2195.000000 | 5885.132143 | 5885.132143 | 4 | 0 |
| 100002 | 1.052632 | 10.000000 | -295.000000 | -315.421053 | 11559.247105 | 11559.247105 | 19 | 0 |
| 100003 | 1.040000 | 5.080000 | -1378.160000 | -1385.320000 | 64754.586000 | 64754.586000 | 25 | 0 |
| 100004 | 1.333333 | 2.000000 | -754.000000 | -761.666667 | 7096.155000 | 7096.155000 | 3 | 0 |
| 100005 | 1.111111 | 5.000000 | -586.000000 | -609.555556 | 6240.205000 | 6240.205000 | 8 | 0 |
# Flatten the (column, statistic) pairs produced by .agg() into single-level
# names such as INST_AMT_PAYMENT_MEAN.
modified_col = [
    "INST_" + column + "_" + statistic.upper()
    for column, statistic in installments_payments_agg.columns
]
installments_payments_agg.columns = modified_col

# Work on a copy restricted to rows with DAYS_ENTRY_PAYMENT >= -365.
installment_temp = installments_payments_data[installments_payments_data.DAYS_ENTRY_PAYMENT >= -365].copy()

# Signed gap between due day and payment day; the per-client minimum is
# stored. A Series (not a one-column DataFrame) is assigned so the column
# aligns on SK_ID_CURR; clients with no row in the window get NaN.
installment_temp['LATE_PAYMENT'] = installment_temp['DAYS_INSTALMENT'] - installment_temp['DAYS_ENTRY_PAYMENT']
installments_payments_agg['MIN_LATE_PAYMENT_365'] = installment_temp.groupby('SK_ID_CURR')['LATE_PAYMENT'].min()

# Number of under-paid instalments in the window. Calling .count() on the raw
# amount difference would count every non-null row regardless of sign, so the
# difference is turned into a 0/1 flag and summed instead.
installment_temp['LESS_PAYMENT'] = (
    (installment_temp['AMT_INSTALMENT'] - installment_temp['AMT_PAYMENT']) > 0
).astype(int)
installments_payments_agg['COUNT_LESS_PAYMENT_365'] = installment_temp.groupby('SK_ID_CURR')['LESS_PAYMENT'].sum()
installments_payments_agg.head()
| INST_NUM_INSTALMENT_VERSION_MEAN | INST_NUM_INSTALMENT_NUMBER_MEAN | INST_DAYS_INSTALMENT_MEAN | INST_DAYS_ENTRY_PAYMENT_MEAN | INST_AMT_INSTALMENT_MEAN | INST_AMT_PAYMENT_MEAN | INST_LATE_PAYMENT_SUM | INST_LESS_PAYMENT_SUM | MIN_LATE_PAYMENT_365 | COUNT_LESS_PAYMENT_365 | |
|---|---|---|---|---|---|---|---|---|---|---|
| SK_ID_CURR | ||||||||||
| 100001 | 1.142857 | 2.714286 | -2187.714286 | -2195.000000 | 5885.132143 | 5885.132143 | 4 | 0 | NaN | NaN |
| 100002 | 1.052632 | 10.000000 | -295.000000 | -315.421053 | 11559.247105 | 11559.247105 | 19 | 0 | 12.0 | 11.0 |
| 100003 | 1.040000 | 5.080000 | -1378.160000 | -1385.320000 | 64754.586000 | 64754.586000 | 25 | 0 | NaN | NaN |
| 100004 | 1.333333 | 2.000000 | -754.000000 | -761.666667 | 7096.155000 | 7096.155000 | 3 | 0 | NaN | NaN |
| 100005 | 1.111111 | 5.000000 | -586.000000 | -609.555556 | 6240.205000 | 6240.205000 | 8 | 0 | NaN | NaN |
# Attach TARGET to the per-client aggregates; the inner join keeps only the
# clients that also appear in app_train.
df_installments_payments = installments_payments_agg.join(app_train.TARGET, how='inner', on='SK_ID_CURR')
# Label attached to the frame for the project summary helpers (it shows up in
# their printed headings).
df_installments_payments.name = 'df_installments_payments'
numerical_summary(df_installments_payments)
Data Frame a 11 colonnes. Dont 11 colonnes contiennent des valeurs manquantes.
| correlations | valeurs_manquantes | %_total | |
|---|---|---|---|
| INST_LATE_PAYMENT_SUM | -0.04 | 0 | 0.00 |
| INST_NUM_INSTALMENT_VERSION_MEAN | -0.03 | 0 | 0.00 |
| INST_AMT_PAYMENT_MEAN | -0.02 | 8 | 0.00 |
| INST_AMT_INSTALMENT_MEAN | -0.02 | 0 | 0.00 |
| MIN_LATE_PAYMENT_365 | -0.01 | 75770 | 25.98 |
| INST_NUM_INSTALMENT_NUMBER_MEAN | -0.01 | 0 | 0.00 |
| INST_LESS_PAYMENT_SUM | 0.03 | 0 | 0.00 |
| COUNT_LESS_PAYMENT_365 | 0.03 | 75770 | 25.98 |
| INST_DAYS_INSTALMENT_MEAN | 0.04 | 0 | 0.00 |
| INST_DAYS_ENTRY_PAYMENT_MEAN | 0.04 | 8 | 0.00 |
| TARGET | 1.00 | 0 | 0.00 |
# Run the project helper select_numerical with correlation=0.03 and
# missing=70, then drop TARGET so only feature columns remain.
installments_payments = select_numerical(df_installments_payments, correlation=0.03, missing=70).drop('TARGET', axis=1)
installments_payments
Data Frame a 11 colonnes. Dont 11 colonnes contiennent des valeurs manquantes.
'>>>> Most correlated with TARGET from df_installments_payments'
| correlations | valeurs_manquantes | %_total | |
|---|---|---|---|
| INST_LATE_PAYMENT_SUM | -0.04 | 0 | 0.0 |
| INST_DAYS_INSTALMENT_MEAN | 0.04 | 0 | 0.0 |
| INST_DAYS_ENTRY_PAYMENT_MEAN | 0.04 | 8 | 0.0 |
| TARGET | 1.00 | 0 | 0.0 |
| INST_LATE_PAYMENT_SUM | INST_DAYS_INSTALMENT_MEAN | INST_DAYS_ENTRY_PAYMENT_MEAN | |
|---|---|---|---|
| SK_ID_CURR | |||
| 100002 | 19 | -295.000000 | -315.421053 |
| 100003 | 25 | -1378.160000 | -1385.320000 |
| 100004 | 3 | -754.000000 | -761.666667 |
| 100006 | 16 | -252.250000 | -271.625000 |
| 100007 | 41 | -1028.606061 | -1032.242424 |
| ... | ... | ... | ... |
| 456251 | 7 | -120.000000 | -156.285714 |
| 456252 | 4 | -2391.000000 | -2393.833333 |
| 456253 | 8 | -2372.928571 | -2387.428571 |
| 456254 | 19 | -142.263158 | -161.263158 |
| 456255 | 58 | -463.945946 | -472.013514 |
291643 rows × 3 columns
# Compare the retained instalment aggregates with TARGET using the project
# helper frame_vs_target; the string is its third argument, presumably used
# as a title — confirm in Functions.
frame_vs_target(installments_payments,app_train.TARGET,'Variables aggregated on SK_ID_CURR from installements_payments.csv ')
# Drop the intermediate instalment frames and run a collection pass; only the
# selected `installments_payments` features are kept.
import gc

del installments_payments_data, installments_payments_agg, df_installments_payments
gc.collect()
7685
# Display the selected numerical application features
# (alternative kept for reference: cleaned_app_train).
selected_numerical_app_train
| EXT_SOURCE_3 | EXT_SOURCE_2 | EXT_SOURCE_1 | DAYS_EMPLOYED | FLOORSMAX_AVG | FLOORSMAX_MEDI | FLOORSMAX_MODE | AMT_GOODS_PRICE | REGION_POPULATION_RELATIVE | DAYS_REGISTRATION | FLAG_DOCUMENT_3 | REG_CITY_NOT_LIVE_CITY | FLAG_EMP_PHONE | REG_CITY_NOT_WORK_CITY | DAYS_ID_PUBLISH | DAYS_LAST_PHONE_CHANGE | REGION_RATING_CLIENT | REGION_RATING_CLIENT_W_CITY | DAYS_BIRTH | TARGET | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| SK_ID_CURR | ||||||||||||||||||||
| 100002 | 0.139376 | 0.262949 | 0.083037 | -637.0 | 0.0833 | 0.0833 | 0.0833 | 351000.0 | 0.018801 | -3648.0 | 1 | 0 | 1 | 0 | -2120 | -1134.0 | 2 | 2 | -9461 | 1 |
| 100003 | NaN | 0.622246 | 0.311267 | -1188.0 | 0.2917 | 0.2917 | 0.2917 | 1129500.0 | 0.003541 | -1186.0 | 1 | 0 | 1 | 0 | -291 | -828.0 | 1 | 1 | -16765 | 0 |
| 100004 | 0.729567 | 0.555912 | NaN | -225.0 | NaN | NaN | NaN | 135000.0 | 0.010032 | -4260.0 | 0 | 0 | 1 | 0 | -2531 | -815.0 | 2 | 2 | -19046 | 0 |
| 100006 | NaN | 0.650442 | NaN | -3039.0 | NaN | NaN | NaN | 297000.0 | 0.008019 | -9833.0 | 1 | 0 | 1 | 0 | -2437 | -617.0 | 2 | 2 | -19005 | 0 |
| 100007 | NaN | 0.322738 | NaN | -3038.0 | NaN | NaN | NaN | 513000.0 | 0.028663 | -4311.0 | 0 | 0 | 1 | 1 | -3458 | -1106.0 | 2 | 2 | -19932 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 456251 | NaN | 0.681632 | 0.145570 | -236.0 | 0.6042 | 0.6042 | 0.4583 | 225000.0 | 0.032561 | -8456.0 | 0 | 0 | 1 | 0 | -1982 | -273.0 | 1 | 1 | -9327 | 0 |
| 456252 | NaN | 0.115992 | NaN | NaN | 0.0833 | 0.0833 | 0.0833 | 225000.0 | 0.025164 | -4388.0 | 1 | 0 | 0 | 0 | -4090 | 0.0 | 2 | 2 | -20775 | 0 |
| 456253 | 0.218859 | 0.535722 | 0.744026 | -7921.0 | 0.1667 | 0.1667 | 0.1667 | 585000.0 | 0.005002 | -6737.0 | 1 | 0 | 1 | 1 | -5150 | -1909.0 | 3 | 3 | -14966 | 0 |
| 456254 | 0.661024 | 0.514163 | NaN | -4786.0 | 0.0417 | 0.0417 | 0.0417 | 319500.0 | 0.005313 | -2562.0 | 1 | 1 | 1 | 1 | -931 | -322.0 | 2 | 2 | -11961 | 1 |
| 456255 | 0.113922 | 0.708569 | 0.734460 | -1262.0 | 0.3750 | 0.3750 | 0.3750 | 675000.0 | 0.046220 | -5128.0 | 1 | 0 | 1 | 1 | -410 | -787.0 | 1 | 1 | -16856 | 0 |
307510 rows × 20 columns
# Display the domain_features frame before merging it.
domain_features
| DOCUMENT_COUNT | AMT_REQ_CREDIT_BUREAU_HDWMQY | DAYS_WORKING_PER | DAYS_UNEMPLOYED | GOODS_PRICE_INCOME_TOTAL_PER | GOODS_PRICE_CREDIT_PER | GOODS_PRICE_AMT_ANNUITY_PER | GOODS_PRICE_EMP | AMT_CREDIT_BIRTH | INCOME_CREDIT_PER | INCOME_PER_PERSON | REST_TO_LIVE | ANNUITY_DAYS_EMPLOYED_PERC | AMT_CREDIT_DAYS_EMPLOYED_PERC | ANNUITY_DAYS_BIRTH_PERC | PAYMENT_RATE | PAYMENT_RATE_INV | PAY_TOWARDS_LOAN | MEAN_DEFAULT_SURR | ADDRESS_MISSMATCH | MEAN_ENQUIRIES | CONTACT_REF | MAX_DAYS_SOMETHING_CHANGED | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| SK_ID_CURR | |||||||||||||||||||||||
| 100002 | 1 | 1.0 | 0.067329 | 8824.0 | 0.576923 | 1.158397 | 0.070372 | 0.001815 | 15686.300338 | 0.498036 | 101250.0 | 177799.50 | -0.025789 | -0.001567 | -0.383029 | 0.060749 | 16.461104 | 177799.5 | 2.0 | 0 | 0.166667 | 4 | -637.0 |
| 100003 | 1 | 0.0 | 0.070862 | 15577.0 | 0.239044 | 1.145199 | 0.031606 | 0.001052 | 28161.551596 | 0.208736 | 90000.0 | 117150.75 | -0.033279 | -0.000918 | -0.469628 | 0.027598 | 36.234085 | 234301.5 | 0.0 | 0 | 0.000000 | 4 | -291.0 |
| 100004 | 0 | 0.0 | 0.011814 | 18821.0 | 0.500000 | 1.000000 | 0.050000 | 0.001667 | 2587.157408 | 0.500000 | 33750.0 | 60750.00 | -0.033333 | -0.001667 | -2.821630 | 0.050000 | 20.000000 | 60750.0 | 0.0 | 0 | 0.000000 | 5 | -225.0 |
| 100006 | 1 | 0.0 | 0.159905 | 15966.0 | 0.454545 | 1.052803 | 0.099955 | 0.010232 | 6005.215075 | 0.431748 | 45000.0 | 52656.75 | -0.102370 | -0.009719 | -0.640190 | 0.094941 | 10.532818 | 105313.5 | 1.0 | 0 | NaN | 3 | -2437.0 |
| 100007 | 1 | 0.0 | 0.152418 | 16894.0 | 0.236842 | 1.000000 | 0.042623 | 0.005922 | 9394.190247 | 0.236842 | 60750.0 | 99634.50 | -0.138940 | -0.005922 | -0.911573 | 0.042623 | 23.461618 | 99634.5 | 0.0 | 2 | 0.000000 | 3 | -3038.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 456251 | 1 | 0.0 | 0.025303 | 9091.0 | 0.700000 | 1.132000 | 0.122480 | 0.001049 | 9967.352847 | 0.618375 | 78750.0 | 129942.00 | -0.008564 | -0.000927 | -0.338450 | 0.108198 | 9.242325 | 129942.0 | 0.0 | 0 | NaN | 3 | -236.0 |
| 456252 | 1 | 0.0 | NaN | NaN | 0.320000 | 1.198000 | 0.053340 | NaN | 4735.776173 | 0.267112 | 36000.0 | 59998.50 | NaN | NaN | -1.731034 | 0.044524 | 22.459693 | 59998.5 | 0.0 | 0 | NaN | 3 | -4090.0 |
| 456253 | 1 | 3.0 | 0.529266 | 7045.0 | 0.261538 | 1.158400 | 0.051246 | 0.013540 | 16527.285848 | 0.225776 | 76500.0 | 123021.00 | -0.264218 | -0.011689 | -0.499216 | 0.044239 | 22.604623 | 123021.0 | 3.0 | 2 | 0.500000 | 4 | -5150.0 |
| 456254 | 1 | 0.0 | 0.400134 | 7175.0 | 0.535211 | 1.158394 | 0.063239 | 0.014980 | 11294.127163 | 0.462029 | 57000.0 | 75397.50 | -0.236872 | -0.012931 | -0.591982 | 0.054592 | 18.317595 | 150795.0 | 0.0 | 2 | 0.000000 | 3 | -931.0 |
| 456255 | 1 | 3.0 | 0.074869 | 15594.0 | 0.233333 | 1.000000 | 0.072767 | 0.001870 | 14616.457048 | 0.233333 | 52500.0 | 54191.25 | -0.025693 | -0.001870 | -0.343177 | 0.072767 | 13.742556 | 108382.5 | 0.0 | 2 | 0.500000 | 5 | -410.0 |
307463 rows × 23 columns
# Start the merged frame from a copy of the selected numerical application
# features (alternative kept for reference: cleaned_app_train.copy()).
data = selected_numerical_app_train.copy()
data.shape
(307510, 20)
# Left join on SK_ID_CURR: every row of `data` is kept; clients absent from
# domain_features get NaN in the added columns.
data = data.join(domain_features, how='left', on='SK_ID_CURR')
data.shape
(307510, 43)
# Left join the `bureau` frame on SK_ID_CURR; row count of `data` is unchanged.
data = data.join(bureau, how='left', on='SK_ID_CURR')
data.shape
(307510, 62)
# Left join the `prev` frame on SK_ID_CURR; row count of `data` is unchanged.
data = data.join(prev, how='left', on='SK_ID_CURR')
data.shape
(307510, 81)
# Left join the `pos` frame on SK_ID_CURR; row count of `data` is unchanged.
data = data.join(pos, how='left', on='SK_ID_CURR')
data.shape
(307510, 82)
# Left join the `credit` frame on SK_ID_CURR; row count of `data` is unchanged.
data = data.join(credit, how='left', on='SK_ID_CURR')
data.shape
(307510, 94)
# Left join the instalment features on SK_ID_CURR, completing the merge.
data = data.join(installments_payments, how='left', on='SK_ID_CURR')
# Label attached to the frame for the project summary helpers (it shows up in
# their printed headings).
data.name = 'data all files aggregated merged'
data.shape
(307510, 97)
# Number of columns per dtype in the merged frame.
data.dtypes.value_counts()
float64 88 int64 9 dtype: int64
# Run the project helper select_numerical on the merged frame with
# correlation=0.04 and missing=50; TARGET is kept in the result here.
numerical_features = select_numerical(data, correlation=0.04, missing=50)
numerical_features
Data Frame a 97 colonnes. Dont 97 colonnes contiennent des valeurs manquantes.
'>>>> Most correlated with TARGET from data all files aggregated merged'
| correlations | valeurs_manquantes | %_total | |
|---|---|---|---|
| EXT_SOURCE_3 | -0.18 | 60965 | 19.83 |
| EXT_SOURCE_2 | -0.16 | 660 | 0.21 |
| EXT_SOURCE_1 | -0.16 | 173378 | 56.38 |
| MONTHS_BALANCE_SIZE_MEAN | -0.08 | 215279 | 70.01 |
| CREDIT_ACTIVE_Closed_MEAN | -0.08 | 44020 | 14.31 |
| PREV_CODE_REJECT_REASON_XAP_MEAN | -0.07 | 16454 | 5.35 |
| PREV_PREV_APP_XAP_MEAN | -0.07 | 16454 | 5.35 |
| DAYS_WORKING_PER | -0.07 | 55400 | 18.02 |
| PREV_NAME_CONTRACT_STATUS_Approved_MEAN | -0.06 | 16454 | 5.35 |
| MONTHS_BALANCE_VAR_MEAN | -0.06 | 215510 | 70.08 |
| STATUS_12_C__MEAN | -0.06 | 217455 | 70.71 |
| STATUS_9_C__MEAN | -0.06 | 217814 | 70.83 |
| STATUS_6_C__MEAN | -0.06 | 218264 | 70.98 |
| STATUS_C_MEAN_MEAN | -0.06 | 215279 | 70.01 |
| STATUS_3_C__MEAN | -0.06 | 218810 | 71.16 |
| GOODS_PRICE_EMP | -0.05 | 55654 | 18.10 |
| FLAG_EMP_PHONE | 0.05 | 0 | 0.00 |
| PREV_DAYS_DECISION_MEAN | 0.05 | 16454 | 5.35 |
| DAYS_CREDIT_ENDDATE_MEAN | 0.05 | 46269 | 15.05 |
| AMT_CREDIT_DAYS_EMPLOYED_PERC | 0.05 | 55400 | 18.02 |
| REG_CITY_NOT_WORK_CITY | 0.05 | 0 | 0.00 |
| DAYS_ID_PUBLISH | 0.05 | 0 | 0.00 |
| CRED_CNT_DRAWINGS_POS_CURRENT_MEAN | 0.05 | 246370 | 80.12 |
| DAYS_ENDDATE_FACT_MEAN | 0.05 | 77156 | 25.09 |
| PREV_CODE_REJECT_REASON_HC_MEAN | 0.05 | 16454 | 5.35 |
| DAYS_LAST_PHONE_CHANGE | 0.06 | 1 | 0.00 |
| POS_MONTHS_BALANCE_MIN | 0.06 | 18067 | 5.88 |
| PREV_CODE_REJECT_REASON_SCOFR_MEAN | 0.06 | 16454 | 5.35 |
| PREV_NAME_PRODUCT_TYPE_walk-in_MEAN | 0.06 | 16454 | 5.35 |
| ANNUITY_DAYS_EMPLOYED_PERC | 0.06 | 55400 | 18.02 |
| CRED_AMT_DRAWINGS_CURRENT_MEAN | 0.06 | 220605 | 71.74 |
| REGION_RATING_CLIENT | 0.06 | 0 | 0.00 |
| CRED_AMT_DRAWINGS_ATM_CURRENT_MEAN | 0.06 | 246370 | 80.12 |
| REGION_RATING_CLIENT_W_CITY | 0.06 | 0 | 0.00 |
| STATUS_1_MEAN_MEAN | 0.06 | 215279 | 70.01 |
| CRED_MONTHS_BALANCE_MEAN | 0.06 | 220605 | 71.74 |
| DAYS_CREDIT_UPDATE_MEAN | 0.07 | 44020 | 14.31 |
| GOODS_PRICE_CREDIT_PER | 0.07 | 323 | 0.11 |
| MAX_DAYS_SOMETHING_CHANGED | 0.07 | 47 | 0.02 |
| CRED_AMT_INST_MIN_REGULARITY_MEAN | 0.07 | 220605 | 71.74 |
| DAYS_EMPLOYED | 0.07 | 55374 | 18.01 |
| DAYS_CREDIT_MIN | 0.08 | 44020 | 14.31 |
| MONTHS_BALANCE_MEAN_MEAN | 0.08 | 215279 | 70.01 |
| CREDIT_ACTIVE_Active_MEAN | 0.08 | 44020 | 14.31 |
| PREV_NAME_CONTRACT_STATUS_Refused_MEAN | 0.08 | 16454 | 5.35 |
| DAYS_BIRTH | 0.08 | 0 | 0.00 |
| CRED_CNT_DRAWINGS_CURRENT_MEAN | 0.08 | 220605 | 71.74 |
| CRED_AMT_RECEIVABLE_PRINCIPAL_MEAN | 0.09 | 220605 | 71.74 |
| CRED_AMT_RECIVABLE_MEAN | 0.09 | 220605 | 71.74 |
| CRED_AMT_TOTAL_RECEIVABLE_MEAN | 0.09 | 220605 | 71.74 |
| CRED_AMT_BALANCE_MEAN | 0.09 | 220605 | 71.74 |
| MONTHS_BALANCE_MIN_MEAN | 0.09 | 215279 | 70.01 |
| DAYS_CREDIT_MEAN | 0.09 | 44020 | 14.31 |
| CRED_CNT_DRAWINGS_ATM_CURRENT_MEAN | 0.11 | 246370 | 80.12 |
| CREDIT_UTIL_MAX | 0.16 | 241852 | 78.65 |
| TARGET | 1.00 | 0 | 0.00 |
| EXT_SOURCE_3 | EXT_SOURCE_2 | CREDIT_ACTIVE_Closed_MEAN | PREV_CODE_REJECT_REASON_XAP_MEAN | PREV_PREV_APP_XAP_MEAN | DAYS_WORKING_PER | PREV_NAME_CONTRACT_STATUS_Approved_MEAN | GOODS_PRICE_EMP | FLAG_EMP_PHONE | PREV_DAYS_DECISION_MEAN | DAYS_CREDIT_ENDDATE_MEAN | AMT_CREDIT_DAYS_EMPLOYED_PERC | REG_CITY_NOT_WORK_CITY | DAYS_ID_PUBLISH | DAYS_ENDDATE_FACT_MEAN | PREV_CODE_REJECT_REASON_HC_MEAN | DAYS_LAST_PHONE_CHANGE | POS_MONTHS_BALANCE_MIN | PREV_CODE_REJECT_REASON_SCOFR_MEAN | PREV_NAME_PRODUCT_TYPE_walk-in_MEAN | ANNUITY_DAYS_EMPLOYED_PERC | REGION_RATING_CLIENT | REGION_RATING_CLIENT_W_CITY | DAYS_CREDIT_UPDATE_MEAN | GOODS_PRICE_CREDIT_PER | MAX_DAYS_SOMETHING_CHANGED | DAYS_EMPLOYED | DAYS_CREDIT_MIN | CREDIT_ACTIVE_Active_MEAN | PREV_NAME_CONTRACT_STATUS_Refused_MEAN | DAYS_BIRTH | DAYS_CREDIT_MEAN | TARGET | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| SK_ID_CURR | |||||||||||||||||||||||||||||||||
| 100002 | 0.139376 | 0.262949 | 0.750000 | 1.000000 | 1.000000 | 0.067329 | 1.000000 | 0.001815 | 1 | -606.000000 | -349.000000 | -0.001567 | 0 | -2120 | -697.500000 | 0.00 | -1134.0 | -19.0 | 0.0 | 0.000000 | -0.025789 | 2 | 2 | -499.875000 | 1.158397 | -637.0 | -637.0 | -1437.0 | 0.250000 | 0.000000 | -9461 | -874.000000 | 1 |
| 100003 | NaN | 0.622246 | 0.750000 | 1.000000 | 1.000000 | 0.070862 | 1.000000 | 0.001052 | 1 | -1305.000000 | -544.500000 | -0.000918 | 0 | -291 | -1097.333333 | 0.00 | -828.0 | -77.0 | 0.0 | 0.000000 | -0.033279 | 1 | 1 | -816.000000 | 1.145199 | -291.0 | -1188.0 | -2586.0 | 0.250000 | 0.000000 | -16765 | -1400.750000 | 0 |
| 100004 | 0.729567 | 0.555912 | 1.000000 | 1.000000 | 1.000000 | 0.011814 | 1.000000 | 0.001667 | 1 | -815.000000 | -488.500000 | -0.001667 | 0 | -2531 | -532.500000 | 0.00 | -815.0 | -27.0 | 0.0 | 0.000000 | -0.033333 | 2 | 2 | -532.000000 | 1.000000 | -225.0 | -225.0 | -1326.0 | 0.000000 | 0.000000 | -19046 | -867.000000 | 0 |
| 100006 | NaN | 0.650442 | NaN | 0.888889 | 0.888889 | 0.159905 | 0.555556 | 0.010232 | 1 | -272.444444 | NaN | -0.009719 | 0 | -2437 | NaN | 0.00 | -617.0 | -20.0 | 0.0 | 0.000000 | -0.102370 | 2 | 2 | NaN | 1.052803 | -2437.0 | -3039.0 | NaN | NaN | 0.111111 | -19005 | NaN | 0 |
| 100007 | NaN | 0.322738 | 1.000000 | 1.000000 | 1.000000 | 0.152418 | 1.000000 | 0.005922 | 1 | -1222.833333 | -783.000000 | -0.005922 | 1 | -3458 | -783.000000 | 0.00 | -1106.0 | -77.0 | 0.0 | 0.166667 | -0.138940 | 2 | 2 | -783.000000 | 1.000000 | -3038.0 | -3038.0 | -1149.0 | 0.000000 | 0.000000 | -19932 | -1149.000000 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 456251 | NaN | 0.681632 | NaN | 1.000000 | 1.000000 | 0.025303 | 1.000000 | 0.001049 | 1 | -273.000000 | NaN | -0.000927 | 0 | -1982 | NaN | 0.00 | -273.0 | -9.0 | 0.0 | 0.000000 | -0.008564 | 1 | 1 | NaN | 1.132000 | -236.0 | -236.0 | NaN | NaN | 0.000000 | -9327 | NaN | 0 |
| 456252 | NaN | 0.115992 | NaN | 1.000000 | 1.000000 | NaN | 1.000000 | NaN | 0 | -2497.000000 | NaN | NaN | 0 | -4090 | NaN | 0.00 | 0.0 | -82.0 | 0.0 | 0.000000 | NaN | 2 | 2 | NaN | 1.198000 | -4090.0 | NaN | NaN | NaN | 0.000000 | -20775 | NaN | 0 |
| 456253 | 0.218859 | 0.535722 | 0.500000 | 1.000000 | 1.000000 | 0.529266 | 1.000000 | 0.013540 | 1 | -2380.000000 | 280.500000 | -0.011689 | 1 | -5150 | -794.000000 | 0.00 | -1909.0 | -96.0 | 0.0 | 0.000000 | -0.264218 | 3 | 3 | -253.250000 | 1.158400 | -5150.0 | -7921.0 | -919.0 | 0.500000 | 0.000000 | -14966 | -867.500000 | 0 |
| 456254 | 0.661024 | 0.514163 | 1.000000 | 1.000000 | 1.000000 | 0.400134 | 1.000000 | 0.014980 | 1 | -299.500000 | -859.000000 | -0.012931 | 1 | -931 | -859.000000 | 0.00 | -322.0 | -11.0 | 0.0 | 0.000000 | -0.236872 | 2 | 2 | -401.000000 | 1.158394 | -931.0 | -4786.0 | -1104.0 | 0.000000 | 0.000000 | -11961 | -1104.000000 | 1 |
| 456255 | 0.113922 | 0.708569 | 0.545455 | 0.750000 | 0.750000 | 0.074869 | 0.750000 | 0.001870 | 1 | -587.625000 | 3231.272727 | -0.001870 | 1 | -410 | -968.333333 | 0.25 | -787.0 | -33.0 | 0.0 | 0.250000 | -0.025693 | 1 | 1 | -531.090909 | 1.000000 | -410.0 | -1262.0 | -2337.0 | 0.454545 | 0.250000 | -16856 | -1089.454545 | 0 |
307510 rows × 33 columns
# Names of the retained numerical columns as a plain list.
numerical_features.columns.to_list()
['EXT_SOURCE_3', 'EXT_SOURCE_2', 'CREDIT_ACTIVE_Closed_MEAN', 'PREV_CODE_REJECT_REASON_XAP_MEAN', 'PREV_PREV_APP_XAP_MEAN', 'DAYS_WORKING_PER', 'PREV_NAME_CONTRACT_STATUS_Approved_MEAN', 'GOODS_PRICE_EMP', 'FLAG_EMP_PHONE', 'PREV_DAYS_DECISION_MEAN', 'DAYS_CREDIT_ENDDATE_MEAN', 'AMT_CREDIT_DAYS_EMPLOYED_PERC', 'REG_CITY_NOT_WORK_CITY', 'DAYS_ID_PUBLISH', 'DAYS_ENDDATE_FACT_MEAN', 'PREV_CODE_REJECT_REASON_HC_MEAN', 'DAYS_LAST_PHONE_CHANGE', 'POS_MONTHS_BALANCE_MIN', 'PREV_CODE_REJECT_REASON_SCOFR_MEAN', 'PREV_NAME_PRODUCT_TYPE_walk-in_MEAN', 'ANNUITY_DAYS_EMPLOYED_PERC', 'REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_W_CITY', 'DAYS_CREDIT_UPDATE_MEAN', 'GOODS_PRICE_CREDIT_PER', 'MAX_DAYS_SOMETHING_CHANGED', 'DAYS_EMPLOYED', 'DAYS_CREDIT_MIN', 'CREDIT_ACTIVE_Active_MEAN', 'PREV_NAME_CONTRACT_STATUS_Refused_MEAN', 'DAYS_BIRTH', 'DAYS_CREDIT_MEAN', 'TARGET']
# Names of the object-dtype (categorical) columns of app_train.
app_train.select_dtypes('object').columns.to_list()
['NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE', 'WEEKDAY_APPR_PROCESS_START', 'ORGANIZATION_TYPE', 'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE']
# Select categorical features of df (assign df.name before calling)
def select_categorical(df, pvalue=0.05, significant=1, missing=20):
    """One-hot encode the object columns of ``df`` and keep those tied to TARGET.

    Parameters
    ----------
    df : DataFrame
        Must expose a ``TARGET`` column; its object-dtype columns are encoded.
    pvalue : float
        Summary rows whose ``p_value`` is above this threshold are rejected.
    significant : float
        Surviving rows must have an ``f_value`` strictly above this value.
    missing : float
        Accepted but not applied by this implementation.
        NOTE(review): the intent appears to be discarding features with more
        than ``missing`` missing values, but no such filter exists here —
        confirm whether one is wanted.

    Returns
    -------
    DataFrame
        The one-hot columns that passed both thresholds.
    """
    # One-hot encoding of every object-dtype column
    dummies = pd.get_dummies(df.select_dtypes('object'))
    # Per-dummy summary from the project helper (read below through its
    # p_value and f_value columns), computed with df.TARGET
    summary = categorical_summary(dummies, df.TARGET, True, False)
    display('> Categorical', summary)
    # First cut: keep rows at or below the p-value threshold
    passed_pvalue = summary[summary.p_value <= pvalue]
    display('>>not_pvalue_rejected', passed_pvalue)
    # Second cut: keep rows strictly above the f-value threshold
    selected_names = passed_pvalue[passed_pvalue['f_value'] > significant].index.tolist()
    return dummies[selected_names]
# Keep the one-hot columns of app_train with p_value <= 0.05 and f_value > 200.
categorical_features = select_categorical(app_train, pvalue=0.05, significant=200, missing=20)
categorical_features
Data Frame a 140 colonnes. Dont 140 colonnes contiennent des valeurs manquantes.
'> Categorical'
| f_value | p_value | valeurs_manquantes | |
|---|---|---|---|
| NAME_INCOME_TYPE_Working | 1019.42 | 0.00 | 0.0 |
| NAME_EDUCATION_TYPE_Higher education | 988.04 | 0.00 | 0.0 |
| CODE_GENDER_M | 923.29 | 0.00 | 0.0 |
| CODE_GENDER_F | 922.97 | 0.00 | 0.0 |
| NAME_EDUCATION_TYPE_Secondary / secondary special | 765.26 | 0.00 | 0.0 |
| NAME_INCOME_TYPE_Pensioner | 658.03 | 0.00 | 0.0 |
| ORGANIZATION_TYPE_XNA | 651.70 | 0.00 | 0.0 |
| OCCUPATION_TYPE_Laborers | 570.14 | 0.00 | 0.0 |
| EMERGENCYSTATE_MODE_No | 548.63 | 0.00 | 0.0 |
| HOUSETYPE_MODE_block of flats | 507.58 | 0.00 | 0.0 |
| WALLSMATERIAL_MODE_Panel | 337.67 | 0.00 | 0.0 |
| NAME_CONTRACT_TYPE_Revolving loans | 293.82 | 0.00 | 0.0 |
| NAME_CONTRACT_TYPE_Cash loans | 293.71 | 0.00 | 0.0 |
| OCCUPATION_TYPE_Drivers | 282.65 | 0.00 | 0.0 |
| NAME_HOUSING_TYPE_With parents | 276.38 | 0.00 | 0.0 |
| ORGANIZATION_TYPE_Self-employed | 261.32 | 0.00 | 0.0 |
| NAME_HOUSING_TYPE_House / apartment | 250.88 | 0.00 | 0.0 |
| OCCUPATION_TYPE_Low-skill Laborers | 234.27 | 0.00 | 0.0 |
| NAME_FAMILY_STATUS_Single / not married | 216.40 | 0.00 | 0.0 |
| NAME_FAMILY_STATUS_Married | 192.95 | 0.00 | 0.0 |
| ORGANIZATION_TYPE_Business Entity Type 3 | 177.12 | 0.00 | 0.0 |
| NAME_INCOME_TYPE_State servant | 169.15 | 0.00 | 0.0 |
| FONDKAPREMONT_MODE_reg oper account | 156.97 | 0.00 | 0.0 |
| NAME_FAMILY_STATUS_Civil marriage | 155.70 | 0.00 | 0.0 |
| FLAG_OWN_CAR_Y | 146.91 | 0.00 | 0.0 |
| FLAG_OWN_CAR_N | 146.88 | 0.00 | 0.0 |
| OCCUPATION_TYPE_Accountants | 143.68 | 0.00 | 0.0 |
| OCCUPATION_TYPE_Core staff | 127.75 | 0.00 | 0.0 |
| ORGANIZATION_TYPE_Construction | 120.50 | 0.00 | 0.0 |
| NAME_HOUSING_TYPE_Rented apartment | 120.20 | 0.00 | 0.0 |
| OCCUPATION_TYPE_Sales staff | 117.42 | 0.00 | 0.0 |
| NAME_FAMILY_STATUS_Widow | 115.71 | 0.00 | 0.0 |
| OCCUPATION_TYPE_Managers | 106.97 | 0.00 | 0.0 |
| ORGANIZATION_TYPE_Transport: type 3 | 94.76 | 0.00 | 0.0 |
| OCCUPATION_TYPE_Security staff | 66.00 | 0.00 | 0.0 |
| OCCUPATION_TYPE_High skill tech staff | 58.28 | 0.00 | 0.0 |
| ORGANIZATION_TYPE_School | 57.48 | 0.00 | 0.0 |
| WALLSMATERIAL_MODE_Stone, brick | 49.26 | 0.00 | 0.0 |
| OCCUPATION_TYPE_Cooking staff | 45.94 | 0.00 | 0.0 |
| NAME_INCOME_TYPE_Commercial associate | 43.59 | 0.00 | 0.0 |
| NAME_EDUCATION_TYPE_Lower secondary | 42.44 | 0.00 | 0.0 |
| FONDKAPREMONT_MODE_org spec account | 39.17 | 0.00 | 0.0 |
| FONDKAPREMONT_MODE_reg oper spec account | 38.97 | 0.00 | 0.0 |
| ORGANIZATION_TYPE_Medicine | 34.68 | 0.00 | 0.0 |
| ORGANIZATION_TYPE_Restaurant | 32.41 | 0.00 | 0.0 |
| ORGANIZATION_TYPE_Military | 31.11 | 0.00 | 0.0 |
| ORGANIZATION_TYPE_Police | 30.06 | 0.00 | 0.0 |
| ORGANIZATION_TYPE_Industry: type 3 | 28.88 | 0.00 | 0.0 |
| ORGANIZATION_TYPE_Bank | 28.40 | 0.00 | 0.0 |
| ORGANIZATION_TYPE_Security Ministries | 27.58 | 0.00 | 0.0 |
| WALLSMATERIAL_MODE_Monolithic | 27.08 | 0.00 | 0.0 |
| ORGANIZATION_TYPE_Trade: type 3 | 24.42 | 0.00 | 0.0 |
| NAME_INCOME_TYPE_Unemployed | 23.73 | 0.00 | 0.0 |
| OCCUPATION_TYPE_Medicine staff | 22.29 | 0.00 | 0.0 |
| NAME_TYPE_SUITE_Unaccompanied | 21.15 | 0.00 | 0.0 |
| NAME_TYPE_SUITE_Family | 20.81 | 0.00 | 0.0 |
| ORGANIZATION_TYPE_Trade: type 7 | 20.52 | 0.00 | 0.0 |
| WALLSMATERIAL_MODE_Wooden | 19.42 | 0.00 | 0.0 |
| ORGANIZATION_TYPE_Agriculture | 19.20 | 0.00 | 0.0 |
| OCCUPATION_TYPE_Waiters/barmen staff | 18.72 | 0.00 | 0.0 |
| ORGANIZATION_TYPE_University | 18.10 | 0.00 | 0.0 |
| ORGANIZATION_TYPE_Government | 17.39 | 0.00 | 0.0 |
| ORGANIZATION_TYPE_Security | 16.06 | 0.00 | 0.0 |
| OCCUPATION_TYPE_Cleaning staff | 14.98 | 0.00 | 0.0 |
| WALLSMATERIAL_MODE_Block | 14.12 | 0.00 | 0.0 |
| ORGANIZATION_TYPE_Industry: type 1 | 12.61 | 0.00 | 0.0 |
| FLAG_OWN_REALTY_Y | 11.64 | 0.00 | 0.0 |
| FLAG_OWN_REALTY_N | 11.62 | 0.00 | 0.0 |
| ORGANIZATION_TYPE_Transport: type 4 | 10.81 | 0.00 | 0.0 |
| ORGANIZATION_TYPE_Trade: type 6 | 10.30 | 0.00 | 0.0 |
| ORGANIZATION_TYPE_Kindergarten | 10.22 | 0.00 | 0.0 |
| ORGANIZATION_TYPE_Industry: type 12 | 9.11 | 0.00 | 0.0 |
| ORGANIZATION_TYPE_Industry: type 9 | 8.90 | 0.00 | 0.0 |
| HOUSETYPE_MODE_specific housing | 8.67 | 0.00 | 0.0 |
| NAME_EDUCATION_TYPE_Academic degree | 8.62 | 0.00 | 0.0 |
| WEEKDAY_APPR_PROCESS_START_MONDAY | 8.15 | 0.00 | 0.0 |
| NAME_HOUSING_TYPE_Office apartment | 8.01 | 0.00 | 0.0 |
| OCCUPATION_TYPE_Private service staff | 7.83 | 0.01 | 0.0 |
| NAME_TYPE_SUITE_Other_B | 7.41 | 0.01 | 0.0 |
| EMERGENCYSTATE_MODE_Yes | 7.17 | 0.01 | 0.0 |
| NAME_INCOME_TYPE_Maternity leave | 6.87 | 0.01 | 0.0 |
| WEEKDAY_APPR_PROCESS_START_TUESDAY | 6.78 | 0.01 | 0.0 |
| ORGANIZATION_TYPE_Industry: type 4 | 5.10 | 0.02 | 0.0 |
| ORGANIZATION_TYPE_Services | 4.61 | 0.03 | 0.0 |
| ORGANIZATION_TYPE_Insurance | 4.56 | 0.03 | 0.0 |
| ORGANIZATION_TYPE_Other | 4.40 | 0.04 | 0.0 |
| ORGANIZATION_TYPE_Hotel | 3.58 | 0.06 | 0.0 |
| ORGANIZATION_TYPE_Transport: type 1 | 3.50 | 0.06 | 0.0 |
| ORGANIZATION_TYPE_Realtor | 3.43 | 0.06 | 0.0 |
| NAME_HOUSING_TYPE_Municipal apartment | 3.41 | 0.06 | 0.0 |
| ORGANIZATION_TYPE_Cleaning | 3.33 | 0.07 | 0.0 |
| ORGANIZATION_TYPE_Culture | 3.28 | 0.07 | 0.0 |
| ORGANIZATION_TYPE_Business Entity Type 2 | 3.05 | 0.08 | 0.0 |
| ORGANIZATION_TYPE_Trade: type 2 | 2.97 | 0.09 | 0.0 |
| ORGANIZATION_TYPE_Electricity | 2.67 | 0.10 | 0.0 |
| ORGANIZATION_TYPE_Industry: type 13 | 2.59 | 0.11 | 0.0 |
| NAME_EDUCATION_TYPE_Incomplete higher | 2.43 | 0.12 | 0.0 |
| FONDKAPREMONT_MODE_not specified | 2.19 | 0.14 | 0.0 |
| NAME_TYPE_SUITE_Children | 2.16 | 0.14 | 0.0 |
| OCCUPATION_TYPE_HR staff | 2.14 | 0.14 | 0.0 |
| ORGANIZATION_TYPE_Trade: type 4 | 2.11 | 0.15 | 0.0 |
| OCCUPATION_TYPE_Secretaries | 1.85 | 0.17 | 0.0 |
| OCCUPATION_TYPE_IT staff | 1.84 | 0.18 | 0.0 |
| WEEKDAY_APPR_PROCESS_START_SATURDAY | 1.77 | 0.18 | 0.0 |
| NAME_INCOME_TYPE_Student | 1.58 | 0.21 | 0.0 |
| ORGANIZATION_TYPE_Industry: type 11 | 1.24 | 0.27 | 0.0 |
| ORGANIZATION_TYPE_Industry: type 5 | 1.22 | 0.27 | 0.0 |
| WALLSMATERIAL_MODE_Mixed | 0.90 | 0.34 | 0.0 |
| NAME_INCOME_TYPE_Businessman | 0.88 | 0.35 | 0.0 |
| ORGANIZATION_TYPE_Emergency | 0.65 | 0.42 | 0.0 |
| NAME_TYPE_SUITE_Spouse, partner | 0.64 | 0.42 | 0.0 |
| WEEKDAY_APPR_PROCESS_START_WEDNESDAY | 0.64 | 0.42 | 0.0 |
| ORGANIZATION_TYPE_Industry: type 8 | 0.63 | 0.43 | 0.0 |
| NAME_TYPE_SUITE_Other_A | 0.58 | 0.45 | 0.0 |
| ORGANIZATION_TYPE_Religion | 0.55 | 0.46 | 0.0 |
| ORGANIZATION_TYPE_Mobile | 0.49 | 0.48 | 0.0 |
| WEEKDAY_APPR_PROCESS_START_SUNDAY | 0.48 | 0.49 | 0.0 |
| ORGANIZATION_TYPE_Industry: type 2 | 0.47 | 0.50 | 0.0 |
| WEEKDAY_APPR_PROCESS_START_FRIDAY | 0.44 | 0.51 | 0.0 |
| NAME_FAMILY_STATUS_Separated | 0.42 | 0.52 | 0.0 |
| ORGANIZATION_TYPE_Industry: type 10 | 0.40 | 0.53 | 0.0 |
| ORGANIZATION_TYPE_Postal | 0.39 | 0.53 | 0.0 |
| CODE_GENDER_XNA | 0.35 | 0.55 | 0.0 |
| ORGANIZATION_TYPE_Trade: type 1 | 0.33 | 0.57 | 0.0 |
| HOUSETYPE_MODE_terraced house | 0.30 | 0.59 | 0.0 |
| ORGANIZATION_TYPE_Trade: type 5 | 0.25 | 0.62 | 0.0 |
| ORGANIZATION_TYPE_Transport: type 2 | 0.22 | 0.64 | 0.0 |
| NAME_FAMILY_STATUS_Unknown | 0.18 | 0.68 | 0.0 |
| ORGANIZATION_TYPE_Telecom | 0.16 | 0.69 | 0.0 |
| ORGANIZATION_TYPE_Industry: type 6 | 0.13 | 0.72 | 0.0 |
| WALLSMATERIAL_MODE_Others | 0.12 | 0.73 | 0.0 |
| ORGANIZATION_TYPE_Housing | 0.07 | 0.80 | 0.0 |
| NAME_TYPE_SUITE_Group of people | 0.06 | 0.80 | 0.0 |
| WEEKDAY_APPR_PROCESS_START_THURSDAY | 0.05 | 0.82 | 0.0 |
| OCCUPATION_TYPE_Realty agents | 0.05 | 0.83 | 0.0 |
| ORGANIZATION_TYPE_Business Entity Type 1 | 0.03 | 0.85 | 0.0 |
| NAME_HOUSING_TYPE_Co-op apartment | 0.03 | 0.86 | 0.0 |
| ORGANIZATION_TYPE_Legal Services | 0.02 | 0.90 | 0.0 |
| ORGANIZATION_TYPE_Advertising | 0.00 | 0.95 | 0.0 |
| ORGANIZATION_TYPE_Industry: type 7 | 0.00 | 0.96 | 0.0 |
'>>not_pvalue_rejected'
| f_value | p_value | valeurs_manquantes | |
|---|---|---|---|
| NAME_INCOME_TYPE_Working | 1019.42 | 0.00 | 0.0 |
| NAME_EDUCATION_TYPE_Higher education | 988.04 | 0.00 | 0.0 |
| CODE_GENDER_M | 923.29 | 0.00 | 0.0 |
| CODE_GENDER_F | 922.97 | 0.00 | 0.0 |
| NAME_EDUCATION_TYPE_Secondary / secondary special | 765.26 | 0.00 | 0.0 |
| NAME_INCOME_TYPE_Pensioner | 658.03 | 0.00 | 0.0 |
| ORGANIZATION_TYPE_XNA | 651.70 | 0.00 | 0.0 |
| OCCUPATION_TYPE_Laborers | 570.14 | 0.00 | 0.0 |
| EMERGENCYSTATE_MODE_No | 548.63 | 0.00 | 0.0 |
| HOUSETYPE_MODE_block of flats | 507.58 | 0.00 | 0.0 |
| WALLSMATERIAL_MODE_Panel | 337.67 | 0.00 | 0.0 |
| NAME_CONTRACT_TYPE_Revolving loans | 293.82 | 0.00 | 0.0 |
| NAME_CONTRACT_TYPE_Cash loans | 293.71 | 0.00 | 0.0 |
| OCCUPATION_TYPE_Drivers | 282.65 | 0.00 | 0.0 |
| NAME_HOUSING_TYPE_With parents | 276.38 | 0.00 | 0.0 |
| ORGANIZATION_TYPE_Self-employed | 261.32 | 0.00 | 0.0 |
| NAME_HOUSING_TYPE_House / apartment | 250.88 | 0.00 | 0.0 |
| OCCUPATION_TYPE_Low-skill Laborers | 234.27 | 0.00 | 0.0 |
| NAME_FAMILY_STATUS_Single / not married | 216.40 | 0.00 | 0.0 |
| NAME_FAMILY_STATUS_Married | 192.95 | 0.00 | 0.0 |
| ORGANIZATION_TYPE_Business Entity Type 3 | 177.12 | 0.00 | 0.0 |
| NAME_INCOME_TYPE_State servant | 169.15 | 0.00 | 0.0 |
| FONDKAPREMONT_MODE_reg oper account | 156.97 | 0.00 | 0.0 |
| NAME_FAMILY_STATUS_Civil marriage | 155.70 | 0.00 | 0.0 |
| FLAG_OWN_CAR_Y | 146.91 | 0.00 | 0.0 |
| FLAG_OWN_CAR_N | 146.88 | 0.00 | 0.0 |
| OCCUPATION_TYPE_Accountants | 143.68 | 0.00 | 0.0 |
| OCCUPATION_TYPE_Core staff | 127.75 | 0.00 | 0.0 |
| ORGANIZATION_TYPE_Construction | 120.50 | 0.00 | 0.0 |
| NAME_HOUSING_TYPE_Rented apartment | 120.20 | 0.00 | 0.0 |
| OCCUPATION_TYPE_Sales staff | 117.42 | 0.00 | 0.0 |
| NAME_FAMILY_STATUS_Widow | 115.71 | 0.00 | 0.0 |
| OCCUPATION_TYPE_Managers | 106.97 | 0.00 | 0.0 |
| ORGANIZATION_TYPE_Transport: type 3 | 94.76 | 0.00 | 0.0 |
| OCCUPATION_TYPE_Security staff | 66.00 | 0.00 | 0.0 |
| OCCUPATION_TYPE_High skill tech staff | 58.28 | 0.00 | 0.0 |
| ORGANIZATION_TYPE_School | 57.48 | 0.00 | 0.0 |
| WALLSMATERIAL_MODE_Stone, brick | 49.26 | 0.00 | 0.0 |
| OCCUPATION_TYPE_Cooking staff | 45.94 | 0.00 | 0.0 |
| NAME_INCOME_TYPE_Commercial associate | 43.59 | 0.00 | 0.0 |
| NAME_EDUCATION_TYPE_Lower secondary | 42.44 | 0.00 | 0.0 |
| FONDKAPREMONT_MODE_org spec account | 39.17 | 0.00 | 0.0 |
| FONDKAPREMONT_MODE_reg oper spec account | 38.97 | 0.00 | 0.0 |
| ORGANIZATION_TYPE_Medicine | 34.68 | 0.00 | 0.0 |
| ORGANIZATION_TYPE_Restaurant | 32.41 | 0.00 | 0.0 |
| ORGANIZATION_TYPE_Military | 31.11 | 0.00 | 0.0 |
| ORGANIZATION_TYPE_Police | 30.06 | 0.00 | 0.0 |
| ORGANIZATION_TYPE_Industry: type 3 | 28.88 | 0.00 | 0.0 |
| ORGANIZATION_TYPE_Bank | 28.40 | 0.00 | 0.0 |
| ORGANIZATION_TYPE_Security Ministries | 27.58 | 0.00 | 0.0 |
| WALLSMATERIAL_MODE_Monolithic | 27.08 | 0.00 | 0.0 |
| ORGANIZATION_TYPE_Trade: type 3 | 24.42 | 0.00 | 0.0 |
| NAME_INCOME_TYPE_Unemployed | 23.73 | 0.00 | 0.0 |
| OCCUPATION_TYPE_Medicine staff | 22.29 | 0.00 | 0.0 |
| NAME_TYPE_SUITE_Unaccompanied | 21.15 | 0.00 | 0.0 |
| NAME_TYPE_SUITE_Family | 20.81 | 0.00 | 0.0 |
| ORGANIZATION_TYPE_Trade: type 7 | 20.52 | 0.00 | 0.0 |
| WALLSMATERIAL_MODE_Wooden | 19.42 | 0.00 | 0.0 |
| ORGANIZATION_TYPE_Agriculture | 19.20 | 0.00 | 0.0 |
| OCCUPATION_TYPE_Waiters/barmen staff | 18.72 | 0.00 | 0.0 |
| ORGANIZATION_TYPE_University | 18.10 | 0.00 | 0.0 |
| ORGANIZATION_TYPE_Government | 17.39 | 0.00 | 0.0 |
| ORGANIZATION_TYPE_Security | 16.06 | 0.00 | 0.0 |
| OCCUPATION_TYPE_Cleaning staff | 14.98 | 0.00 | 0.0 |
| WALLSMATERIAL_MODE_Block | 14.12 | 0.00 | 0.0 |
| ORGANIZATION_TYPE_Industry: type 1 | 12.61 | 0.00 | 0.0 |
| FLAG_OWN_REALTY_Y | 11.64 | 0.00 | 0.0 |
| FLAG_OWN_REALTY_N | 11.62 | 0.00 | 0.0 |
| ORGANIZATION_TYPE_Transport: type 4 | 10.81 | 0.00 | 0.0 |
| ORGANIZATION_TYPE_Trade: type 6 | 10.30 | 0.00 | 0.0 |
| ORGANIZATION_TYPE_Kindergarten | 10.22 | 0.00 | 0.0 |
| ORGANIZATION_TYPE_Industry: type 12 | 9.11 | 0.00 | 0.0 |
| ORGANIZATION_TYPE_Industry: type 9 | 8.90 | 0.00 | 0.0 |
| HOUSETYPE_MODE_specific housing | 8.67 | 0.00 | 0.0 |
| NAME_EDUCATION_TYPE_Academic degree | 8.62 | 0.00 | 0.0 |
| WEEKDAY_APPR_PROCESS_START_MONDAY | 8.15 | 0.00 | 0.0 |
| NAME_HOUSING_TYPE_Office apartment | 8.01 | 0.00 | 0.0 |
| OCCUPATION_TYPE_Private service staff | 7.83 | 0.01 | 0.0 |
| NAME_TYPE_SUITE_Other_B | 7.41 | 0.01 | 0.0 |
| EMERGENCYSTATE_MODE_Yes | 7.17 | 0.01 | 0.0 |
| NAME_INCOME_TYPE_Maternity leave | 6.87 | 0.01 | 0.0 |
| WEEKDAY_APPR_PROCESS_START_TUESDAY | 6.78 | 0.01 | 0.0 |
| ORGANIZATION_TYPE_Industry: type 4 | 5.10 | 0.02 | 0.0 |
| ORGANIZATION_TYPE_Services | 4.61 | 0.03 | 0.0 |
| ORGANIZATION_TYPE_Insurance | 4.56 | 0.03 | 0.0 |
| ORGANIZATION_TYPE_Other | 4.40 | 0.04 | 0.0 |
| NAME_INCOME_TYPE_Working | NAME_EDUCATION_TYPE_Higher education | CODE_GENDER_M | CODE_GENDER_F | NAME_EDUCATION_TYPE_Secondary / secondary special | NAME_INCOME_TYPE_Pensioner | ORGANIZATION_TYPE_XNA | OCCUPATION_TYPE_Laborers | EMERGENCYSTATE_MODE_No | HOUSETYPE_MODE_block of flats | WALLSMATERIAL_MODE_Panel | NAME_CONTRACT_TYPE_Revolving loans | NAME_CONTRACT_TYPE_Cash loans | OCCUPATION_TYPE_Drivers | NAME_HOUSING_TYPE_With parents | ORGANIZATION_TYPE_Self-employed | NAME_HOUSING_TYPE_House / apartment | OCCUPATION_TYPE_Low-skill Laborers | NAME_FAMILY_STATUS_Single / not married | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| SK_ID_CURR | |||||||||||||||||||
| 100002 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 |
| 100003 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 |
| 100004 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 |
| 100006 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 |
| 100007 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 456251 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 |
| 456252 | 0 | 0 | 0 | 1 | 1 | 1 | 1 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 |
| 456253 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 |
| 456254 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 |
| 456255 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 |
307511 rows × 19 columns
# List the names of the one-hot encoded categorical columns that were retained
categorical_features.columns.tolist()
['NAME_INCOME_TYPE_Working', 'NAME_EDUCATION_TYPE_Higher education', 'CODE_GENDER_M', 'CODE_GENDER_F', 'NAME_EDUCATION_TYPE_Secondary / secondary special', 'NAME_INCOME_TYPE_Pensioner', 'ORGANIZATION_TYPE_XNA', 'OCCUPATION_TYPE_Laborers', 'EMERGENCYSTATE_MODE_No', 'HOUSETYPE_MODE_block of flats', 'WALLSMATERIAL_MODE_Panel', 'NAME_CONTRACT_TYPE_Revolving loans', 'NAME_CONTRACT_TYPE_Cash loans', 'OCCUPATION_TYPE_Drivers', 'NAME_HOUSING_TYPE_With parents', 'ORGANIZATION_TYPE_Self-employed', 'NAME_HOUSING_TYPE_House / apartment', 'OCCUPATION_TYPE_Low-skill Laborers', 'NAME_FAMILY_STATUS_Single / not married']
# Build the modelling table: numerical features on the left, with the
# retained categorical dummy columns attached by matching on SK_ID_CURR.
features = numerical_features.join(
    categorical_features,
    on='SK_ID_CURR',
    how='left',
)
display(features)
features.shape
| EXT_SOURCE_3 | EXT_SOURCE_2 | CREDIT_ACTIVE_Closed_MEAN | PREV_CODE_REJECT_REASON_XAP_MEAN | PREV_PREV_APP_XAP_MEAN | DAYS_WORKING_PER | PREV_NAME_CONTRACT_STATUS_Approved_MEAN | GOODS_PRICE_EMP | FLAG_EMP_PHONE | PREV_DAYS_DECISION_MEAN | DAYS_CREDIT_ENDDATE_MEAN | AMT_CREDIT_DAYS_EMPLOYED_PERC | REG_CITY_NOT_WORK_CITY | DAYS_ID_PUBLISH | DAYS_ENDDATE_FACT_MEAN | PREV_CODE_REJECT_REASON_HC_MEAN | DAYS_LAST_PHONE_CHANGE | POS_MONTHS_BALANCE_MIN | PREV_CODE_REJECT_REASON_SCOFR_MEAN | PREV_NAME_PRODUCT_TYPE_walk-in_MEAN | ANNUITY_DAYS_EMPLOYED_PERC | REGION_RATING_CLIENT | REGION_RATING_CLIENT_W_CITY | DAYS_CREDIT_UPDATE_MEAN | GOODS_PRICE_CREDIT_PER | MAX_DAYS_SOMETHING_CHANGED | DAYS_EMPLOYED | DAYS_CREDIT_MIN | CREDIT_ACTIVE_Active_MEAN | PREV_NAME_CONTRACT_STATUS_Refused_MEAN | DAYS_BIRTH | DAYS_CREDIT_MEAN | TARGET | NAME_INCOME_TYPE_Working | NAME_EDUCATION_TYPE_Higher education | CODE_GENDER_M | CODE_GENDER_F | NAME_EDUCATION_TYPE_Secondary / secondary special | NAME_INCOME_TYPE_Pensioner | ORGANIZATION_TYPE_XNA | OCCUPATION_TYPE_Laborers | EMERGENCYSTATE_MODE_No | HOUSETYPE_MODE_block of flats | WALLSMATERIAL_MODE_Panel | NAME_CONTRACT_TYPE_Revolving loans | NAME_CONTRACT_TYPE_Cash loans | OCCUPATION_TYPE_Drivers | NAME_HOUSING_TYPE_With parents | ORGANIZATION_TYPE_Self-employed | NAME_HOUSING_TYPE_House / apartment | OCCUPATION_TYPE_Low-skill Laborers | NAME_FAMILY_STATUS_Single / not married | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| SK_ID_CURR | ||||||||||||||||||||||||||||||||||||||||||||||||||||
| 100002 | 0.139376 | 0.262949 | 0.750000 | 1.000000 | 1.000000 | 0.067329 | 1.000000 | 0.001815 | 1 | -606.000000 | -349.000000 | -0.001567 | 0 | -2120 | -697.500000 | 0.00 | -1134.0 | -19.0 | 0.0 | 0.000000 | -0.025789 | 2 | 2 | -499.875000 | 1.158397 | -637.0 | -637.0 | -1437.0 | 0.250000 | 0.000000 | -9461 | -874.000000 | 1 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 |
| 100003 | NaN | 0.622246 | 0.750000 | 1.000000 | 1.000000 | 0.070862 | 1.000000 | 0.001052 | 1 | -1305.000000 | -544.500000 | -0.000918 | 0 | -291 | -1097.333333 | 0.00 | -828.0 | -77.0 | 0.0 | 0.000000 | -0.033279 | 1 | 1 | -816.000000 | 1.145199 | -291.0 | -1188.0 | -2586.0 | 0.250000 | 0.000000 | -16765 | -1400.750000 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 |
| 100004 | 0.729567 | 0.555912 | 1.000000 | 1.000000 | 1.000000 | 0.011814 | 1.000000 | 0.001667 | 1 | -815.000000 | -488.500000 | -0.001667 | 0 | -2531 | -532.500000 | 0.00 | -815.0 | -27.0 | 0.0 | 0.000000 | -0.033333 | 2 | 2 | -532.000000 | 1.000000 | -225.0 | -225.0 | -1326.0 | 0.000000 | 0.000000 | -19046 | -867.000000 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 |
| 100006 | NaN | 0.650442 | NaN | 0.888889 | 0.888889 | 0.159905 | 0.555556 | 0.010232 | 1 | -272.444444 | NaN | -0.009719 | 0 | -2437 | NaN | 0.00 | -617.0 | -20.0 | 0.0 | 0.000000 | -0.102370 | 2 | 2 | NaN | 1.052803 | -2437.0 | -3039.0 | NaN | NaN | 0.111111 | -19005 | NaN | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 |
| 100007 | NaN | 0.322738 | 1.000000 | 1.000000 | 1.000000 | 0.152418 | 1.000000 | 0.005922 | 1 | -1222.833333 | -783.000000 | -0.005922 | 1 | -3458 | -783.000000 | 0.00 | -1106.0 | -77.0 | 0.0 | 0.166667 | -0.138940 | 2 | 2 | -783.000000 | 1.000000 | -3038.0 | -3038.0 | -1149.0 | 0.000000 | 0.000000 | -19932 | -1149.000000 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 456251 | NaN | 0.681632 | NaN | 1.000000 | 1.000000 | 0.025303 | 1.000000 | 0.001049 | 1 | -273.000000 | NaN | -0.000927 | 0 | -1982 | NaN | 0.00 | -273.0 | -9.0 | 0.0 | 0.000000 | -0.008564 | 1 | 1 | NaN | 1.132000 | -236.0 | -236.0 | NaN | NaN | 0.000000 | -9327 | NaN | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 |
| 456252 | NaN | 0.115992 | NaN | 1.000000 | 1.000000 | NaN | 1.000000 | NaN | 0 | -2497.000000 | NaN | NaN | 0 | -4090 | NaN | 0.00 | 0.0 | -82.0 | 0.0 | 0.000000 | NaN | 2 | 2 | NaN | 1.198000 | -4090.0 | NaN | NaN | NaN | 0.000000 | -20775 | NaN | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 1 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 |
| 456253 | 0.218859 | 0.535722 | 0.500000 | 1.000000 | 1.000000 | 0.529266 | 1.000000 | 0.013540 | 1 | -2380.000000 | 280.500000 | -0.011689 | 1 | -5150 | -794.000000 | 0.00 | -1909.0 | -96.0 | 0.0 | 0.000000 | -0.264218 | 3 | 3 | -253.250000 | 1.158400 | -5150.0 | -7921.0 | -919.0 | 0.500000 | 0.000000 | -14966 | -867.500000 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 |
| 456254 | 0.661024 | 0.514163 | 1.000000 | 1.000000 | 1.000000 | 0.400134 | 1.000000 | 0.014980 | 1 | -299.500000 | -859.000000 | -0.012931 | 1 | -931 | -859.000000 | 0.00 | -322.0 | -11.0 | 0.0 | 0.000000 | -0.236872 | 2 | 2 | -401.000000 | 1.158394 | -931.0 | -4786.0 | -1104.0 | 0.000000 | 0.000000 | -11961 | -1104.000000 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 |
| 456255 | 0.113922 | 0.708569 | 0.545455 | 0.750000 | 0.750000 | 0.074869 | 0.750000 | 0.001870 | 1 | -587.625000 | 3231.272727 | -0.001870 | 1 | -410 | -968.333333 | 0.25 | -787.0 | -33.0 | 0.0 | 0.250000 | -0.025693 | 1 | 1 | -531.090909 | 1.000000 | -410.0 | -1262.0 | -2337.0 | 0.454545 | 0.250000 | -16856 | -1089.454545 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 |
307510 rows × 52 columns
(307510, 52)
# Discard, in place, every row that holds at least one missing value
features.dropna(axis='index', how='any', inplace=True)
display(features)
| EXT_SOURCE_3 | EXT_SOURCE_2 | CREDIT_ACTIVE_Closed_MEAN | PREV_CODE_REJECT_REASON_XAP_MEAN | PREV_PREV_APP_XAP_MEAN | DAYS_WORKING_PER | PREV_NAME_CONTRACT_STATUS_Approved_MEAN | GOODS_PRICE_EMP | FLAG_EMP_PHONE | PREV_DAYS_DECISION_MEAN | DAYS_CREDIT_ENDDATE_MEAN | AMT_CREDIT_DAYS_EMPLOYED_PERC | REG_CITY_NOT_WORK_CITY | DAYS_ID_PUBLISH | DAYS_ENDDATE_FACT_MEAN | PREV_CODE_REJECT_REASON_HC_MEAN | DAYS_LAST_PHONE_CHANGE | POS_MONTHS_BALANCE_MIN | PREV_CODE_REJECT_REASON_SCOFR_MEAN | PREV_NAME_PRODUCT_TYPE_walk-in_MEAN | ANNUITY_DAYS_EMPLOYED_PERC | REGION_RATING_CLIENT | REGION_RATING_CLIENT_W_CITY | DAYS_CREDIT_UPDATE_MEAN | GOODS_PRICE_CREDIT_PER | MAX_DAYS_SOMETHING_CHANGED | DAYS_EMPLOYED | DAYS_CREDIT_MIN | CREDIT_ACTIVE_Active_MEAN | PREV_NAME_CONTRACT_STATUS_Refused_MEAN | DAYS_BIRTH | DAYS_CREDIT_MEAN | TARGET | NAME_INCOME_TYPE_Working | NAME_EDUCATION_TYPE_Higher education | CODE_GENDER_M | CODE_GENDER_F | NAME_EDUCATION_TYPE_Secondary / secondary special | NAME_INCOME_TYPE_Pensioner | ORGANIZATION_TYPE_XNA | OCCUPATION_TYPE_Laborers | EMERGENCYSTATE_MODE_No | HOUSETYPE_MODE_block of flats | WALLSMATERIAL_MODE_Panel | NAME_CONTRACT_TYPE_Revolving loans | NAME_CONTRACT_TYPE_Cash loans | OCCUPATION_TYPE_Drivers | NAME_HOUSING_TYPE_With parents | ORGANIZATION_TYPE_Self-employed | NAME_HOUSING_TYPE_House / apartment | OCCUPATION_TYPE_Low-skill Laborers | NAME_FAMILY_STATUS_Single / not married | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| SK_ID_CURR | ||||||||||||||||||||||||||||||||||||||||||||||||||||
| 100002 | 0.139376 | 0.262949 | 0.750000 | 1.00 | 1.00 | 0.067329 | 1.00 | 0.001815 | 1 | -606.000000 | -349.000000 | -0.001567 | 0 | -2120 | -697.500000 | 0.00 | -1134.0 | -19.0 | 0.0 | 0.00 | -0.025789 | 2 | 2 | -499.875000 | 1.158397 | -637.0 | -637.0 | -1437.0 | 0.250000 | 0.00 | -9461 | -874.000000 | 1 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 |
| 100004 | 0.729567 | 0.555912 | 1.000000 | 1.00 | 1.00 | 0.011814 | 1.00 | 0.001667 | 1 | -815.000000 | -488.500000 | -0.001667 | 0 | -2531 | -532.500000 | 0.00 | -815.0 | -27.0 | 0.0 | 0.00 | -0.033333 | 2 | 2 | -532.000000 | 1.000000 | -225.0 | -225.0 | -1326.0 | 0.000000 | 0.00 | -19046 | -867.000000 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 |
| 100008 | 0.621226 | 0.354225 | 0.666667 | 1.00 | 1.00 | 0.093737 | 0.80 | 0.003494 | 1 | -1192.000000 | -391.333333 | -0.003238 | 0 | -477 | -909.000000 | 0.00 | -2536.0 | -84.0 | 0.0 | 0.00 | -0.057709 | 2 | 2 | -611.000000 | 1.079198 | -477.0 | -1588.0 | -1097.0 | 0.333333 | 0.00 | -16941 | -757.333333 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 |
| 100009 | 0.492060 | 0.724000 | 0.777778 | 1.00 | 1.00 | 0.227174 | 1.00 | 0.002244 | 1 | -719.285714 | -794.937500 | -0.002005 | 0 | -619 | -1108.500000 | 0.00 | -1562.0 | -96.0 | 0.0 | 0.00 | -0.075785 | 2 | 2 | -851.611111 | 1.118800 | -619.0 | -3130.0 | -2882.0 | 0.222222 | 0.00 | -13778 | -1271.500000 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 |
| 100010 | 0.540654 | 0.714279 | 0.500000 | 1.00 | 1.00 | 0.023820 | 1.00 | 0.000293 | 1 | -1070.000000 | -119.500000 | -0.000293 | 1 | -2379 | -1138.000000 | 0.00 | -1070.0 | -35.0 | 0.0 | 0.00 | -0.010671 | 3 | 3 | -578.000000 | 1.000000 | -449.0 | -449.0 | -2741.0 | 0.500000 | 0.00 | -18850 | -1939.500000 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 456246 | 0.735221 | 0.313334 | 0.666667 | 1.00 | 1.00 | 0.543859 | 1.00 | 0.038827 | 1 | -897.000000 | -234.666667 | -0.038827 | 1 | -4531 | -1331.000000 | 0.00 | -1516.0 | -50.0 | 0.0 | 0.00 | -0.822599 | 2 | 2 | -373.333333 | 1.000000 | -4531.0 | -8736.0 | -2141.0 | 0.333333 | 0.00 | -16063 | -1361.000000 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 |
| 456247 | 0.609276 | 0.501221 | 0.727273 | 0.80 | 0.80 | 0.033614 | 0.80 | 0.001612 | 1 | -1387.800000 | 1449.818182 | -0.001155 | 0 | -3936 | -1085.000000 | 0.20 | -2315.0 | -95.0 | 0.0 | 0.00 | -0.022453 | 2 | 2 | -768.818182 | 1.396000 | -399.0 | -399.0 | -2482.0 | 0.272727 | 0.20 | -11870 | -1043.181818 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 |
| 456253 | 0.218859 | 0.535722 | 0.500000 | 1.00 | 1.00 | 0.529266 | 1.00 | 0.013540 | 1 | -2380.000000 | 280.500000 | -0.011689 | 1 | -5150 | -794.000000 | 0.00 | -1909.0 | -96.0 | 0.0 | 0.00 | -0.264218 | 3 | 3 | -253.250000 | 1.158400 | -5150.0 | -7921.0 | -919.0 | 0.500000 | 0.00 | -14966 | -867.500000 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 |
| 456254 | 0.661024 | 0.514163 | 1.000000 | 1.00 | 1.00 | 0.400134 | 1.00 | 0.014980 | 1 | -299.500000 | -859.000000 | -0.012931 | 1 | -931 | -859.000000 | 0.00 | -322.0 | -11.0 | 0.0 | 0.00 | -0.236872 | 2 | 2 | -401.000000 | 1.158394 | -931.0 | -4786.0 | -1104.0 | 0.000000 | 0.00 | -11961 | -1104.000000 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 |
| 456255 | 0.113922 | 0.708569 | 0.545455 | 0.75 | 0.75 | 0.074869 | 0.75 | 0.001870 | 1 | -587.625000 | 3231.272727 | -0.001870 | 1 | -410 | -968.333333 | 0.25 | -787.0 | -33.0 | 0.0 | 0.25 | -0.025693 | 1 | 1 | -531.090909 | 1.000000 | -410.0 | -1262.0 | -2337.0 | 0.454545 | 0.25 | -16856 | -1089.454545 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 |
166167 rows × 52 columns
# Discard, in place, every row that holds at least one missing value.
# NOTE(review): this repeats the previous cell; the displayed row count is
# unchanged (166167), so the second pass appears to be a no-op.
features.dropna(axis='index', how='any', inplace=True)
display(features)
| EXT_SOURCE_3 | EXT_SOURCE_2 | CREDIT_ACTIVE_Closed_MEAN | PREV_CODE_REJECT_REASON_XAP_MEAN | PREV_PREV_APP_XAP_MEAN | DAYS_WORKING_PER | PREV_NAME_CONTRACT_STATUS_Approved_MEAN | GOODS_PRICE_EMP | FLAG_EMP_PHONE | PREV_DAYS_DECISION_MEAN | DAYS_CREDIT_ENDDATE_MEAN | AMT_CREDIT_DAYS_EMPLOYED_PERC | REG_CITY_NOT_WORK_CITY | DAYS_ID_PUBLISH | DAYS_ENDDATE_FACT_MEAN | PREV_CODE_REJECT_REASON_HC_MEAN | DAYS_LAST_PHONE_CHANGE | POS_MONTHS_BALANCE_MIN | PREV_CODE_REJECT_REASON_SCOFR_MEAN | PREV_NAME_PRODUCT_TYPE_walk-in_MEAN | ANNUITY_DAYS_EMPLOYED_PERC | REGION_RATING_CLIENT | REGION_RATING_CLIENT_W_CITY | DAYS_CREDIT_UPDATE_MEAN | GOODS_PRICE_CREDIT_PER | MAX_DAYS_SOMETHING_CHANGED | DAYS_EMPLOYED | DAYS_CREDIT_MIN | CREDIT_ACTIVE_Active_MEAN | PREV_NAME_CONTRACT_STATUS_Refused_MEAN | DAYS_BIRTH | DAYS_CREDIT_MEAN | TARGET | NAME_INCOME_TYPE_Working | NAME_EDUCATION_TYPE_Higher education | CODE_GENDER_M | CODE_GENDER_F | NAME_EDUCATION_TYPE_Secondary / secondary special | NAME_INCOME_TYPE_Pensioner | ORGANIZATION_TYPE_XNA | OCCUPATION_TYPE_Laborers | EMERGENCYSTATE_MODE_No | HOUSETYPE_MODE_block of flats | WALLSMATERIAL_MODE_Panel | NAME_CONTRACT_TYPE_Revolving loans | NAME_CONTRACT_TYPE_Cash loans | OCCUPATION_TYPE_Drivers | NAME_HOUSING_TYPE_With parents | ORGANIZATION_TYPE_Self-employed | NAME_HOUSING_TYPE_House / apartment | OCCUPATION_TYPE_Low-skill Laborers | NAME_FAMILY_STATUS_Single / not married | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| SK_ID_CURR | ||||||||||||||||||||||||||||||||||||||||||||||||||||
| 100002 | 0.139376 | 0.262949 | 0.750000 | 1.00 | 1.00 | 0.067329 | 1.00 | 0.001815 | 1 | -606.000000 | -349.000000 | -0.001567 | 0 | -2120 | -697.500000 | 0.00 | -1134.0 | -19.0 | 0.0 | 0.00 | -0.025789 | 2 | 2 | -499.875000 | 1.158397 | -637.0 | -637.0 | -1437.0 | 0.250000 | 0.00 | -9461 | -874.000000 | 1 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 |
| 100004 | 0.729567 | 0.555912 | 1.000000 | 1.00 | 1.00 | 0.011814 | 1.00 | 0.001667 | 1 | -815.000000 | -488.500000 | -0.001667 | 0 | -2531 | -532.500000 | 0.00 | -815.0 | -27.0 | 0.0 | 0.00 | -0.033333 | 2 | 2 | -532.000000 | 1.000000 | -225.0 | -225.0 | -1326.0 | 0.000000 | 0.00 | -19046 | -867.000000 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 |
| 100008 | 0.621226 | 0.354225 | 0.666667 | 1.00 | 1.00 | 0.093737 | 0.80 | 0.003494 | 1 | -1192.000000 | -391.333333 | -0.003238 | 0 | -477 | -909.000000 | 0.00 | -2536.0 | -84.0 | 0.0 | 0.00 | -0.057709 | 2 | 2 | -611.000000 | 1.079198 | -477.0 | -1588.0 | -1097.0 | 0.333333 | 0.00 | -16941 | -757.333333 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 |
| 100009 | 0.492060 | 0.724000 | 0.777778 | 1.00 | 1.00 | 0.227174 | 1.00 | 0.002244 | 1 | -719.285714 | -794.937500 | -0.002005 | 0 | -619 | -1108.500000 | 0.00 | -1562.0 | -96.0 | 0.0 | 0.00 | -0.075785 | 2 | 2 | -851.611111 | 1.118800 | -619.0 | -3130.0 | -2882.0 | 0.222222 | 0.00 | -13778 | -1271.500000 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 |
| 100010 | 0.540654 | 0.714279 | 0.500000 | 1.00 | 1.00 | 0.023820 | 1.00 | 0.000293 | 1 | -1070.000000 | -119.500000 | -0.000293 | 1 | -2379 | -1138.000000 | 0.00 | -1070.0 | -35.0 | 0.0 | 0.00 | -0.010671 | 3 | 3 | -578.000000 | 1.000000 | -449.0 | -449.0 | -2741.0 | 0.500000 | 0.00 | -18850 | -1939.500000 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 456246 | 0.735221 | 0.313334 | 0.666667 | 1.00 | 1.00 | 0.543859 | 1.00 | 0.038827 | 1 | -897.000000 | -234.666667 | -0.038827 | 1 | -4531 | -1331.000000 | 0.00 | -1516.0 | -50.0 | 0.0 | 0.00 | -0.822599 | 2 | 2 | -373.333333 | 1.000000 | -4531.0 | -8736.0 | -2141.0 | 0.333333 | 0.00 | -16063 | -1361.000000 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 |
| 456247 | 0.609276 | 0.501221 | 0.727273 | 0.80 | 0.80 | 0.033614 | 0.80 | 0.001612 | 1 | -1387.800000 | 1449.818182 | -0.001155 | 0 | -3936 | -1085.000000 | 0.20 | -2315.0 | -95.0 | 0.0 | 0.00 | -0.022453 | 2 | 2 | -768.818182 | 1.396000 | -399.0 | -399.0 | -2482.0 | 0.272727 | 0.20 | -11870 | -1043.181818 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 |
| 456253 | 0.218859 | 0.535722 | 0.500000 | 1.00 | 1.00 | 0.529266 | 1.00 | 0.013540 | 1 | -2380.000000 | 280.500000 | -0.011689 | 1 | -5150 | -794.000000 | 0.00 | -1909.0 | -96.0 | 0.0 | 0.00 | -0.264218 | 3 | 3 | -253.250000 | 1.158400 | -5150.0 | -7921.0 | -919.0 | 0.500000 | 0.00 | -14966 | -867.500000 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 |
| 456254 | 0.661024 | 0.514163 | 1.000000 | 1.00 | 1.00 | 0.400134 | 1.00 | 0.014980 | 1 | -299.500000 | -859.000000 | -0.012931 | 1 | -931 | -859.000000 | 0.00 | -322.0 | -11.0 | 0.0 | 0.00 | -0.236872 | 2 | 2 | -401.000000 | 1.158394 | -931.0 | -4786.0 | -1104.0 | 0.000000 | 0.00 | -11961 | -1104.000000 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 |
| 456255 | 0.113922 | 0.708569 | 0.545455 | 0.75 | 0.75 | 0.074869 | 0.75 | 0.001870 | 1 | -587.625000 | 3231.272727 | -0.001870 | 1 | -410 | -968.333333 | 0.25 | -787.0 | -33.0 | 0.0 | 0.25 | -0.025693 | 1 | 1 | -531.090909 | 1.000000 | -410.0 | -1262.0 | -2337.0 | 0.454545 | 0.25 | -16856 | -1089.454545 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 |
166167 rows × 52 columns
# Remove invariant features: columns whose variance is exactly zero carry
# no information once the rows with missing values have been removed.
from sklearn.feature_selection import VarianceThreshold

# The selector is fitted on the whole frame (TARGET column included).
zero_var = VarianceThreshold(threshold=0.0).fit(features)
# get_support() flags the columns that are kept; invert it to get the constant ones.
zero_var_cols = features.columns[~zero_var.get_support()]
print(zero_var_cols)
features.drop(columns=zero_var_cols, inplace=True)
Index(['ORGANIZATION_TYPE_XNA'], dtype='object')
Check for pairs of variables which are correlated with each other above 65%.
# Find pairs of mutually correlated features, using a threshold of 0.65
# (the value actually passed below).
# NOTE(review): in the displayed result, the "feat_1 corr to target" and
# "feat_2 corr to target" columns are identical on every row — verify the
# correlated_features helper reports the second value correctly.
correlations = correlated_features(features, 0.65)
correlations
| feat_1 | feat_1 corr to target | feat_2 | feat_2 corr to target | feat_1 vs feat_2 | |
|---|---|---|---|---|---|
| 0 | PREV_PREV_APP_XAP_MEAN | -0.077589 | PREV_CODE_REJECT_REASON_XAP_MEAN | -0.077589 | 1.000000 |
| 1 | PREV_NAME_CONTRACT_STATUS_Approved_MEAN | -0.072954 | PREV_CODE_REJECT_REASON_XAP_MEAN | -0.072954 | 0.719522 |
| 2 | PREV_NAME_CONTRACT_STATUS_Approved_MEAN | -0.072954 | PREV_PREV_APP_XAP_MEAN | -0.072954 | 0.719522 |
| 3 | POS_MONTHS_BALANCE_MIN | 0.046423 | PREV_DAYS_DECISION_MEAN | 0.046423 | 0.693314 |
| 4 | ANNUITY_DAYS_EMPLOYED_PERC | 0.056607 | AMT_CREDIT_DAYS_EMPLOYED_PERC | 0.056607 | 0.896973 |
| 5 | REGION_RATING_CLIENT_W_CITY | 0.057790 | REGION_RATING_CLIENT | 0.057790 | 0.951702 |
| 6 | DAYS_CREDIT_UPDATE_MEAN | 0.064906 | DAYS_ENDDATE_FACT_MEAN | 0.064906 | 0.703742 |
| 7 | DAYS_EMPLOYED | 0.071078 | ANNUITY_DAYS_EMPLOYED_PERC | 0.071078 | 0.729086 |
| 8 | DAYS_CREDIT_MIN | 0.069053 | DAYS_ENDDATE_FACT_MEAN | 0.069053 | 0.727464 |
| 9 | PREV_NAME_CONTRACT_STATUS_Refused_MEAN | 0.081782 | PREV_CODE_REJECT_REASON_HC_MEAN | 0.081782 | 0.700142 |
| 10 | DAYS_CREDIT_MEAN | 0.087448 | DAYS_ENDDATE_FACT_MEAN | 0.087448 | 0.781536 |
| 11 | DAYS_CREDIT_MEAN | 0.087448 | DAYS_CREDIT_UPDATE_MEAN | 0.087448 | 0.720371 |
| 12 | DAYS_CREDIT_MEAN | 0.087448 | DAYS_CREDIT_MIN | 0.087448 | 0.770859 |
| 13 | HOUSETYPE_MODE_block of flats | -0.035085 | EMERGENCYSTATE_MODE_No | -0.035085 | 0.913147 |
Je choisis de garder les variables de la colonne feat_2
# For each correlated pair keep the feat_2 member: every distinct feature
# listed under feat_1 is dropped (even if it is the feat_2 of another pair).
features.drop(columns=correlations['feat_1'].unique(), inplace=True)
features.shape
(166167, 40)
# Visual overview of the final modelling table.
# The target is taken from `features` itself rather than from `app_train`:
# `features` only holds the rows that survived the earlier dropna, so this
# guarantees the target has the same rows and index as the plotted frame.
frame_vs_target(features.drop('TARGET', axis=1), features['TARGET'], 'All variables for modelisation by TARGET value ')
# Distribution of each variable.
multi_violin(features, 'Variables distributions')
# Missing-value matrix of the final table.
msno.matrix(features)
<AxesSubplot:>
# Number of rows per TARGET class in the final table
features.TARGET.value_counts()
0 153076 1 13091 Name: TARGET, dtype: int64
# Class ratio: number of TARGET == 1 rows divided by number of TARGET == 0 rows
# (a ratio between the two classes, not the share of class 1 in the total).
target_counts = features.TARGET.value_counts()
print(f'TARGET ratio {target_counts[1] / target_counts[0]:.8%}')
TARGET ratio 8.55196112%
# Histogram of the TARGET classes (trailing semicolon hides the axes repr)
features['TARGET'].astype(int).plot(kind='hist');
# Persist the final feature table (compressed) for the modelling notebooks.
path='FEATURES/'
# Create the output directory if needed so the dump does not fail on a fresh checkout.
os.makedirs(path, exist_ok=True)
joblib.dump(features, path+'EDA 1-7 all features_aggregated.feat', compress=3)
['FEATURES/EDA 1-7 all features_aggregated.feat']